]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
changed format of builtin shader strings to use comma separated lines,
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
// byte alignments applied by ALIGN() and ATOMIC() respectively
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 4

// Per-toolchain definitions of:
//   ALIGN(var) / ATOMIC(var)   declare var with ALIGN_SIZE / ATOMIC_SIZE alignment
//   MEMORY_BARRIER             store fence (_mm_sfence)
//   ATOMIC_COUNTER             type used for atomically updated counters
//   ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD
//                              atomic read-modify-write on an ATOMIC_COUNTER
// Whatever a branch leaves undefined is filled in by the plain,
// non-atomic fallbacks further down.
#ifdef SSE_POSSIBLE
	#if defined(__APPLE__)
		#include <libkern/OSAtomic.h>
		#define ALIGN(var) var __attribute__((__aligned__(ALIGN_SIZE)))
		#define ATOMIC(var) var __attribute__((__aligned__(ATOMIC_SIZE)))
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile int32_t
		#define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
		#define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
	#elif defined(__GNUC__) && defined(WIN32)
		#define ALIGN(var) var __attribute__((__aligned__(ALIGN_SIZE)))
		#define ATOMIC(var) var __attribute__((__aligned__(ATOMIC_SIZE)))
		#define MEMORY_BARRIER (_mm_sfence())
		//(__sync_synchronize())
		#define ATOMIC_COUNTER volatile LONG
		// The (LONG *) casts work around broken mingw packages on Ubuntu
		// whose Interlocked* prototypes take a plain LONG * and would
		// otherwise fail to compile here.  Platforms that declare the
		// functions correctly (e.g. mingw on Windows) accept the cast
		// without errors or warnings.
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
	#elif defined(__GNUC__)
		#define ALIGN(var) var __attribute__((__aligned__(ALIGN_SIZE)))
		#define ATOMIC(var) var __attribute__((__aligned__(ATOMIC_SIZE)))
		#define MEMORY_BARRIER (_mm_sfence())
		//(__sync_synchronize())
		#define ATOMIC_COUNTER volatile int
		#define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
		#define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
		#define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
	#elif defined(_MSC_VER)
		#define ALIGN(var) __declspec(align(ALIGN_SIZE)) var
		#define ATOMIC(var) __declspec(align(ATOMIC_SIZE)) var
		#define MEMORY_BARRIER (_mm_sfence())
		//(MemoryBarrier())
		#define ATOMIC_COUNTER volatile LONG
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
	#endif
#endif

// fallbacks: no extra alignment, no barrier, ordinary (non-atomic) arithmetic
#ifndef ALIGN
	#define ALIGN(var) var
#endif
#ifndef ATOMIC
	#define ATOMIC(var) var
#endif
#ifndef MEMORY_BARRIER
	#define MEMORY_BARRIER ((void)0)
#endif
#ifndef ATOMIC_COUNTER
	#define ATOMIC_COUNTER int
#endif
#ifndef ATOMIC_INCREMENT
	#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#ifndef ATOMIC_DECREMENT
	#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#ifndef ATOMIC_ADD
	#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
86
// Aligned allocation helpers.  With SSE the buffers come from
// _mm_malloc/_mm_free (ALIGN_SIZE alignment); otherwise the plain C
// allocator is used.  Memory from MM_MALLOC/MM_CALLOC must be released
// with MM_FREE, never with free() directly.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// Fallback definition of _mm_cvtss_f32 for GCC releases older than 4.6
// (clang excluded).  NOTE(review): presumably those releases' headers lack
// the intrinsic - confirm.  The test must compare __GNUC__ (with trailing
// underscores) and only look at the minor version when the major is 4.
#if defined(__GNUC__) && !defined(__clang__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
	#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif

#define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)

// calloc() equivalent on top of _mm_malloc: returns nmemb*size zeroed bytes
// aligned to ALIGN_SIZE, or NULL if the allocation fails or the byte count
// does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// reject element counts whose product would wrap around
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ALIGN_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
109
// Indices of the per-vertex attribute arrays (position, colour and eight
// texcoord sets); DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is used
// to size per-attribute tables.
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION  = 0,
	DPSOFTRAST_ARRAY_COLOR     = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL     = 10
}
DPSOFTRAST_ARRAY;
125
126 typedef struct DPSOFTRAST_Texture_s
127 {
128         int flags;
129         int width;
130         int height;
131         int depth;
132         int sides;
133         DPSOFTRAST_TEXTURE_FILTER filter;
134         int mipmaps;
135         int size;
136         ATOMIC_COUNTER binds;
137         unsigned char *bytes;
138         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
139 }
140 DPSOFTRAST_Texture;
141
// command records share the ALIGN_SIZE alignment
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Common header that starts every command record: the opcode selecting the
// command and the size of the whole record.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode;
	unsigned short commandsize;
}
DPSOFTRAST_Command);

// opcode 0 is reserved for the reset pseudo-command
enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares
//   - the constant DPSOFTRAST_OPCODE_<name> == opcodeval, and
//   - the aligned struct type DPSOFTRAST_Command_<name>, which repeats the
//     common header members and then appends `fields`.
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
162
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
165
166 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
167 {
168         int freecommand;
169         int usedcommands;
170         ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
171 }
172 DPSOFTRAST_State_Command_Pool);
173
174 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
175 {
176         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
177         float w[3];
178         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
179 }
180 DPSOFTRAST_State_Triangle);
181
// Evaluate an interpolated attribute at the start of a span.
//   slope receives attribs[arrayindex][0] (change per pixel in x)
//   data  receives attribs[arrayindex][2] + span->x * slope
//                                         + span->y * attribs[arrayindex][1]
// DPSOFTRAST_CALCATTRIB is the SSE form (data/slope are __m128 lvalues);
// DPSOFTRAST_CALCATTRIB4F is the scalar form (data/slope are float[4]).
// Both expand to a braced block and evaluate their arguments several times,
// so pass side-effect-free expressions only.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// every use of `span`, `data` and `slope` is parenthesized so that pointer
// expressions such as `spans + i` can be passed safely
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
198                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels produced by rasterizing a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	// triangle this span was generated by
	int triangle;
	// framebuffer coordinates
	int x;
	int y;
	// usable range (according to pixelmask)
	int startx;
	int endx;
	// true for pixels that passed depth test, false for others
	unsigned char *pixelmask;
	// depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
	int depthbase;
	// depthbuffer value pixel delta
	int depthslope;
}
DPSOFTRAST_State_Span);
213
// capacities of the per-thread span/triangle buffers and the longest span
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// dirty bits: each one marks a group of derived thread state that must be
// recomputed before drawing
#define DPSOFTRAST_VALIDATE_FB (1<<0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1<<1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1<<2)
// everything a draw call depends on
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
222
// Blend modes the rasterizer implements; blend state is mapped onto one of
// these in DPSOFTRAST_RecalcBlendFunc.  DPSOFTRAST_BLENDMODE_TOTAL is the
// number of modes.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
	DPSOFTRAST_BLENDMODE_ALPHA       = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
	DPSOFTRAST_BLENDMODE_ADD         = 3,
	DPSOFTRAST_BLENDMODE_INVMOD      = 4,
	DPSOFTRAST_BLENDMODE_MUL         = 5,
	DPSOFTRAST_BLENDMODE_MUL2        = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD      = 9,
	DPSOFTRAST_BLENDMODE_TOTAL       = 10
}
DPSOFTRAST_BLENDMODE;
238
239 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
240 {
241         void *thread;
242         int index;
243         
244         int cullface;
245         int colormask[4];
246         int blendfunc[2];
247         int blendsubtract;
248         int depthmask;
249         int depthtest;
250         int depthfunc;
251         int scissortest;
252         int viewport[4];
253         int scissor[4];
254         float depthrange[2];
255         float polygonoffset[2];
256         float clipplane[4];
257         ALIGN(float fb_clipplane[4]);
258
259         int shader_mode;
260         int shader_permutation;
261         int shader_exactspecularmath;
262
263         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
264         
265         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
266         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
267
268         // DPSOFTRAST_VALIDATE_ flags
269         int validate;
270
271         // derived values (DPSOFTRAST_VALIDATE_FB)
272         int fb_colormask;
273         int fb_scissor[4];
274         ALIGN(float fb_viewportcenter[4]);
275         ALIGN(float fb_viewportscale[4]);
276
277         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
278         int fb_depthfunc;
279
280         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
281         int fb_blendmode;
282
283         // band boundaries
284         int miny1;
285         int maxy1;
286         int miny2;
287         int maxy2;
288
289         ATOMIC(volatile int commandoffset);
290
291         volatile bool waiting;
292         volatile bool starving;
293         void *waitcond;
294         void *drawcond;
295         void *drawmutex;
296
297         int numspans;
298         int numtriangles;
299         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
300         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
301         unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
302 }
303 DPSOFTRAST_State_Thread);
304
305 typedef ALIGN(struct DPSOFTRAST_State_s
306 {
307         int fb_width;
308         int fb_height;
309         unsigned int *fb_depthpixels;
310         unsigned int *fb_colorpixels[4];
311
312         int viewport[4];
313         ALIGN(float fb_viewportcenter[4]);
314         ALIGN(float fb_viewportscale[4]);
315
316         float color[4];
317         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
318         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
319
320         const float *pointer_vertex3f;
321         const float *pointer_color4f;
322         const unsigned char *pointer_color4ub;
323         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
324         int stride_vertex;
325         int stride_color;
326         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
327         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
328         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
329
330         int firstvertex;
331         int numvertices;
332         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
333         float *screencoord4f;
334         int drawstarty;
335         int drawendy;
336         int drawclipped;
337         
338         int shader_mode;
339         int shader_permutation;
340         int shader_exactspecularmath;
341
342         int texture_max;
343         int texture_end;
344         int texture_firstfree;
345         DPSOFTRAST_Texture *texture;
346
347         int bigendian;
348
349         // error reporting
350         const char *errorstring;
351
352         bool usethreads;
353         int interlace;
354         int numthreads;
355         DPSOFTRAST_State_Thread *threads;
356
357         ATOMIC(volatile int drawcommand);
358
359         DPSOFTRAST_State_Command_Pool commandpool;
360 }
361 DPSOFTRAST_State);
362
363 DPSOFTRAST_State dpsoftrast;
364
// depth buffer fixed-point scale (2^30) and offset
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Pack four float channels (0..1) into a 32-bit pixel with blue in bits
// 0-7, green in bits 8-15, red in bits 16-23 and alpha in bits 24-31, each
// rounded to nearest.  Arguments are parenthesized so expressions may be
// passed, and the shifts are done on unsigned values because shifting 255
// into the top byte of a signed int is undefined; the result is still an int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Convert a float depth d to the integer depth buffer format: d == 0 maps to
// DPSOFTRAST_DEPTHSCALE and d == 1 maps to 0.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
369
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
372
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
374 {
375         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377         fb_viewportcenter[3] = 0.5f;
378         fb_viewportcenter[0] = 0.0f;
379         fb_viewportscale[1] = 0.5f * viewport[2];
380         fb_viewportscale[2] = -0.5f * viewport[3];
381         fb_viewportscale[3] = 0.5f;
382         fb_viewportscale[0] = 1.0f;
383 }
384
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
386 {
387         if (dpsoftrast.interlace)
388         {
389                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
393         }
394         else
395         {
396                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
398         }
399 }
400
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
402 {
403         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
408 }
409
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
411 {
412         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413         // and viewport projection values
414         int x1, x2;
415         int y1, y2;
416         x1 = thread->scissor[0];
417         x2 = thread->scissor[0] + thread->scissor[2];
418         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419         y2 = dpsoftrast.fb_height - thread->scissor[1];
420         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
421         if (x1 < 0) x1 = 0;
422         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
423         if (y1 < 0) y1 = 0;
424         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
425         thread->fb_scissor[0] = x1;
426         thread->fb_scissor[1] = y1;
427         thread->fb_scissor[2] = x2 - x1;
428         thread->fb_scissor[3] = y2 - y1;
429
430         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431         DPSOFTRAST_RecalcClipPlane(thread);
432         DPSOFTRAST_RecalcThread(thread);
433 }
434
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
436 {
437         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
438 }
439
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
441 {
442         if (thread->blendsubtract)
443         {
444                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
445                 {
446                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
450                 }
451         }
452         else
453         {       
454                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
455                 {
456                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
461                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
467                 }
468         }
469 }
470
471 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
472
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
474 {
475         mask &= thread->validate;
476         if (!mask)
477                 return;
478         if (mask & DPSOFTRAST_VALIDATE_FB)
479         {
480                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481                 DPSOFTRAST_RecalcFB(thread);
482         }
483         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
484         {
485                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486                 DPSOFTRAST_RecalcDepthFunc(thread);
487         }
488         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
489         {
490                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491                 DPSOFTRAST_RecalcBlendFunc(thread);
492         }
493 }
494
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
496 {
497         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498                 return &dpsoftrast.texture[index];
499         return NULL;
500 }
501
502 static void DPSOFTRAST_Texture_Grow(void)
503 {
504         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505         DPSOFTRAST_State_Thread *thread;
506         int i;
507         int j;
508         DPSOFTRAST_Flush();
509         // expand texture array as needed
510         if (dpsoftrast.texture_max < 1024)
511                 dpsoftrast.texture_max = 1024;
512         else
513                 dpsoftrast.texture_max *= 2;
514         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
515         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516                 if (dpsoftrast.texbound[i])
517                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
518         for (j = 0; j < dpsoftrast.numthreads; j++)
519         {
520                 thread = &dpsoftrast.threads[j];
521                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522                         if (thread->texbound[i])
523                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
524         }
525 }
526
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
528 {
529         int w;
530         int h;
531         int d;
532         int size;
533         int s;
534         int texnum;
535         int mipmaps;
536         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538         DPSOFTRAST_Texture *texture;
539         if (width*height*depth < 1)
540         {
541                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
542                 return 0;
543         }
544         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
545         {
546                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
547                 return 0;
548         }
549         switch(texformat)
550         {
551         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
554                 break;
555         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
557                 {
558                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
559                         return 0;
560                 }
561                 if (depth != 1)
562                 {
563                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
564                         return 0;
565                 }
566                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
567                 {
568                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
569                         return 0;
570                 }
571                 break;
572         }
573         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
574         {
575                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
576                 return 0;
577         }
578         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
579         {
580                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
581                 return 0;
582         }
583         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
584         {
585                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
586                 return 0;
587         }
588         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
589         {
590                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
591                 return 0;
592         }
593         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
594         {
595                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
596                 return 0;
597         }
598         // find first empty slot in texture array
599         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600                 if (!dpsoftrast.texture[texnum].bytes)
601                         break;
602         dpsoftrast.texture_firstfree = texnum + 1;
603         if (dpsoftrast.texture_max <= texnum)
604                 DPSOFTRAST_Texture_Grow();
605         if (dpsoftrast.texture_end <= texnum)
606                 dpsoftrast.texture_end = texnum + 1;
607         texture = &dpsoftrast.texture[texnum];
608         memset(texture, 0, sizeof(*texture));
609         texture->flags = flags;
610         texture->width = width;
611         texture->height = height;
612         texture->depth = depth;
613         texture->sides = sides;
614         texture->binds = 0;
615         w = width;
616         h = height;
617         d = depth;
618         size = 0;
619         mipmaps = 0;
620         for (;;)
621         {
622                 s = w * h * d * sides * 4;
623                 texture->mipmap[mipmaps][0] = size;
624                 texture->mipmap[mipmaps][1] = s;
625                 texture->mipmap[mipmaps][2] = w;
626                 texture->mipmap[mipmaps][3] = h;
627                 texture->mipmap[mipmaps][4] = d;
628                 size += s;
629                 mipmaps++;
630                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
631                         break;
632                 if (w > 1) w >>= 1;
633                 if (h > 1) h >>= 1;
634                 if (d > 1) d >>= 1;
635         }
636         texture->mipmaps = mipmaps;
637         texture->size = size;
638
639         // allocate the pixels now
640         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
641
642         return texnum;
643 }
644 void DPSOFTRAST_Texture_Free(int index)
645 {
646         DPSOFTRAST_Texture *texture;
647         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
648         if (texture->binds)
649                 DPSOFTRAST_Flush();
650         if (texture->bytes)
651                 MM_FREE(texture->bytes);
652         texture->bytes = NULL;
653         memset(texture, 0, sizeof(*texture));
654         // adjust the free range and used range
655         if (dpsoftrast.texture_firstfree > index)
656                 dpsoftrast.texture_firstfree = index;
657         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
658                 dpsoftrast.texture_end--;
659 }
660 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
661 {
662         int i, x, y, z, w, layer0, layer1, row0, row1;
663         unsigned char *o, *i0, *i1, *i2, *i3;
664         DPSOFTRAST_Texture *texture;
665         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
666         if (texture->mipmaps <= 1)
667                 return;
668         for (i = 1;i < texture->mipmaps;i++)
669         {
670                 for (z = 0;z < texture->mipmap[i][4];z++)
671                 {
672                         layer0 = z*2;
673                         layer1 = z*2+1;
674                         if (layer1 >= texture->mipmap[i-1][4])
675                                 layer1 = texture->mipmap[i-1][4]-1;
676                         for (y = 0;y < texture->mipmap[i][3];y++)
677                         {
678                                 row0 = y*2;
679                                 row1 = y*2+1;
680                                 if (row1 >= texture->mipmap[i-1][3])
681                                         row1 = texture->mipmap[i-1][3]-1;
682                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
683                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
684                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
685                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
686                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
687                                 w = texture->mipmap[i][2];
688                                 if (layer1 > layer0)
689                                 {
690                                         if (texture->mipmap[i-1][2] > 1)
691                                         {
692                                                 // average 3D texture
693                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
694                                                 {
695                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
696                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
697                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
698                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
699                                                 }
700                                         }
701                                         else
702                                         {
703                                                 // average 3D mipmap with parent width == 1
704                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
705                                                 {
706                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
707                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
708                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
709                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
710                                                 }
711                                         }
712                                 }
713                                 else
714                                 {
715                                         if (texture->mipmap[i-1][2] > 1)
716                                         {
717                                                 // average 2D texture (common case)
718                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
719                                                 {
720                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
721                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
722                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
723                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
724                                                 }
725                                         }
726                                         else
727                                         {
728                                                 // 2D texture with parent width == 1
729                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
730                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
731                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
732                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
733                                         }
734                                 }
735                         }
736                 }
737         }
738 }
739 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
740 {
741         DPSOFTRAST_Texture *texture;
742         unsigned char *dst;
743         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
744         if (texture->binds)
745                 DPSOFTRAST_Flush();
746         if (pixels)
747         {
748                 dst = texture->bytes + texture->mipmap[0][1] +(-blocky * texture->mipmap[0][2] + blockx) * 4;
749                 while (blockheight > 0)
750                 {
751                         dst -= texture->mipmap[0][2] * 4;
752                         memcpy(dst, pixels, blockwidth * 4);
753                         pixels += blockwidth * 4;
754                         blockheight--;
755                 }
756         }
757         DPSOFTRAST_Texture_CalculateMipmaps(index);
758 }
759 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
760 {
761         DPSOFTRAST_Texture *texture;
762         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
763         if (texture->binds)
764                 DPSOFTRAST_Flush();
765         if (pixels)
766         {
767                 int i, stride = texture->mipmap[0][2]*4;
768                 unsigned char *dst = texture->bytes + texture->mipmap[0][1];
769                 for (i = texture->mipmap[0][3];i > 0;i--)
770                 {
771                         dst -= stride;
772                         memcpy(dst, pixels, stride);
773                         pixels += stride;
774                 }
775         }
776         DPSOFTRAST_Texture_CalculateMipmaps(index);
777 }
778 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
779 {
780         DPSOFTRAST_Texture *texture;
781         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
782         return texture->mipmap[mip][2];
783 }
784 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
785 {
786         DPSOFTRAST_Texture *texture;
787         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
788         return texture->mipmap[mip][3];
789 }
790 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
791 {
792         DPSOFTRAST_Texture *texture;
793         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
794         return texture->mipmap[mip][4];
795 }
796 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
797 {
798         DPSOFTRAST_Texture *texture;
799         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
800         if (texture->binds)
801                 DPSOFTRAST_Flush();
802         return texture->bytes + texture->mipmap[mip][0];
803 }
804 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
805 {
806         DPSOFTRAST_Texture *texture;
807         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
808         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
809         {
810                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
811                 return;
812         }
813         if (texture->binds)
814                 DPSOFTRAST_Flush();
815         texture->filter = filter;
816 }
817
818 static void DPSOFTRAST_Draw_FlushThreads(void);
819
820 static void DPSOFTRAST_Draw_SyncCommands(void)
821 {
822         if(dpsoftrast.usethreads) MEMORY_BARRIER;
823         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
824 }
825
826 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
827 {
828         DPSOFTRAST_State_Thread *thread;
829         int i;
830         int freecommand = dpsoftrast.commandpool.freecommand;
831         int usedcommands = dpsoftrast.commandpool.usedcommands;
832         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
833                 return;
834         DPSOFTRAST_Draw_SyncCommands();
835         for(;;)
836         {
837                 int waitindex = -1;
838                 int commandoffset;
839                 usedcommands = 0;
840                 for (i = 0; i < dpsoftrast.numthreads; i++)
841                 {
842                         thread = &dpsoftrast.threads[i]; 
843                         commandoffset = freecommand - thread->commandoffset;
844                         if (commandoffset < 0)
845                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
846                         if (commandoffset > usedcommands)
847                         {
848                                 waitindex = i;
849                                 usedcommands = commandoffset;
850                         }
851                 }
852                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
853                         break;
854                 thread = &dpsoftrast.threads[waitindex];
855                 Thread_LockMutex(thread->drawmutex);
856                 if (thread->commandoffset != dpsoftrast.drawcommand)
857                 {
858                         thread->waiting = true;
859                         if (thread->starving) Thread_CondSignal(thread->drawcond);
860                         Thread_CondWait(thread->waitcond, thread->drawmutex);
861                         thread->waiting = false;
862                 }
863                 Thread_UnlockMutex(thread->drawmutex);
864         }
865         dpsoftrast.commandpool.usedcommands = usedcommands;
866 }
867
// rounds size up to the next multiple of COMMAND_SIZE
// (the mask arithmetic requires COMMAND_SIZE to be a power of two)
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) & ~(COMMAND_SIZE-1))
// reserves an aligned DPSOFTRAST_Command_<name> in the command pool, with its
// opcode already set to DPSOFTRAST_OPCODE_<name>, and yields a typed pointer
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name , \
		DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
872
873 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
874 {
875         DPSOFTRAST_Command *command;
876         int freecommand = dpsoftrast.commandpool.freecommand;
877         int usedcommands = dpsoftrast.commandpool.usedcommands;
878         int extra = sizeof(DPSOFTRAST_Command);
879         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
880                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
881         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
882         {
883                 if (dpsoftrast.usethreads)
884                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
885                 else
886                         DPSOFTRAST_Draw_FlushThreads();
887                 freecommand = dpsoftrast.commandpool.freecommand;
888                 usedcommands = dpsoftrast.commandpool.usedcommands;
889         }
890         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
891         {
892                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
893                 command->opcode = DPSOFTRAST_OPCODE_Reset;
894                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
895                 freecommand = 0;
896         }
897         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
898         command->opcode = opcode;
899         command->commandsize = size;
900         freecommand += size;
901         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
902                 freecommand = 0;
903         dpsoftrast.commandpool.freecommand = freecommand;
904         dpsoftrast.commandpool.usedcommands = usedcommands + size;
905         return command;
906 }
907
908 static void DPSOFTRAST_UndoCommand(int size)
909 {
910         int freecommand = dpsoftrast.commandpool.freecommand;
911         int usedcommands = dpsoftrast.commandpool.usedcommands;
912         freecommand -= size;
913         if (freecommand < 0)
914                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
915         usedcommands -= size;
916         dpsoftrast.commandpool.freecommand = freecommand;
917         dpsoftrast.commandpool.usedcommands = usedcommands;
918 }
919                 
920 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
921 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
922 {
923         thread->viewport[0] = command->x;
924         thread->viewport[1] = command->y;
925         thread->viewport[2] = command->width;
926         thread->viewport[3] = command->height;
927         thread->validate |= DPSOFTRAST_VALIDATE_FB;
928 }
929 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
930 {
931         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
932         command->x = x;
933         command->y = y;
934         command->width = width;
935         command->height = height;
936
937         dpsoftrast.viewport[0] = x;
938         dpsoftrast.viewport[1] = y;
939         dpsoftrast.viewport[2] = width;
940         dpsoftrast.viewport[3] = height;
941         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
942 }
943
944 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
945 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
946 {
947         int i, x1, y1, x2, y2, w, h, x, y;
948         int miny1, maxy1, miny2, maxy2;
949         int bandy;
950         unsigned int *p;
951         unsigned int c;
952         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
953         miny1 = thread->miny1;
954         maxy1 = thread->maxy1;
955         miny2 = thread->miny2;
956         maxy2 = thread->maxy2;
957         x1 = thread->fb_scissor[0];
958         y1 = thread->fb_scissor[1];
959         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
960         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
961         if (y1 < miny1) y1 = miny1;
962         if (y2 > maxy2) y2 = maxy2;
963         w = x2 - x1;
964         h = y2 - y1;
965         if (w < 1 || h < 1)
966                 return;
967         // FIXME: honor fb_colormask?
968         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
969         for (i = 0;i < 4;i++)
970         {
971                 if (!dpsoftrast.fb_colorpixels[i])
972                         continue;
973                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
974                 for (;y < bandy;y++)
975                 {
976                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
977                         for (x = x1;x < x2;x++)
978                                 p[x] = c;
979                 }
980         }
981 }
982 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
983 {
984         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
985         command->r = r;
986         command->g = g;
987         command->b = b;
988         command->a = a;
989 }
990
991 DEFCOMMAND(3, ClearDepth, float depth;)
992 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
993 {
994         int x1, y1, x2, y2, w, h, x, y;
995         int miny1, maxy1, miny2, maxy2;
996         int bandy;
997         unsigned int *p;
998         unsigned int c;
999         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
1000         miny1 = thread->miny1;
1001         maxy1 = thread->maxy1;
1002         miny2 = thread->miny2;
1003         maxy2 = thread->maxy2;
1004         x1 = thread->fb_scissor[0];
1005         y1 = thread->fb_scissor[1];
1006         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1007         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
1008         if (y1 < miny1) y1 = miny1;
1009         if (y2 > maxy2) y2 = maxy2;
1010         w = x2 - x1;
1011         h = y2 - y1;
1012         if (w < 1 || h < 1)
1013                 return;
1014         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
1015         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1016         for (;y < bandy;y++)
1017         {
1018                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1019                 for (x = x1;x < x2;x++)
1020                         p[x] = c;
1021         }
1022 }
1023 void DPSOFTRAST_ClearDepth(float d)
1024 {
1025         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1026         command->depth = d;
1027 }
1028
1029 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1030 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1031 {
1032         thread->colormask[0] = command->r != 0;
1033         thread->colormask[1] = command->g != 0;
1034         thread->colormask[2] = command->b != 0;
1035         thread->colormask[3] = command->a != 0;
1036         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1037 }
1038 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1039 {
1040         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1041         command->r = r;
1042         command->g = g;
1043         command->b = b;
1044         command->a = a;
1045 }
1046
1047 DEFCOMMAND(5, DepthTest, int enable;)
1048 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1049 {
1050         thread->depthtest = command->enable;
1051         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1052 }
1053 void DPSOFTRAST_DepthTest(int enable)
1054 {
1055         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1056         command->enable = enable;
1057 }
1058
1059 DEFCOMMAND(6, ScissorTest, int enable;)
1060 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1061 {
1062         thread->scissortest = command->enable;
1063         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1064 }
1065 void DPSOFTRAST_ScissorTest(int enable)
1066 {
1067         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1068         command->enable = enable;
1069 }
1070
1071 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1072 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1073 {
1074         thread->scissor[0] = command->x;
1075         thread->scissor[1] = command->y;
1076         thread->scissor[2] = command->width;
1077         thread->scissor[3] = command->height;
1078         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1079 }
1080 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1081 {
1082         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1083         command->x = x;
1084         command->y = y;
1085         command->width = width;
1086         command->height = height;
1087 }
1088
1089 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1090 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1091 {
1092         thread->blendfunc[0] = command->sfactor;
1093         thread->blendfunc[1] = command->dfactor;
1094         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1095 }
1096 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1097 {
1098         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1099         command->sfactor = sfactor;
1100         command->dfactor = dfactor;
1101 }
1102
1103 DEFCOMMAND(9, BlendSubtract, int enable;)
1104 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1105 {
1106         thread->blendsubtract = command->enable;
1107         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1108 }
1109 void DPSOFTRAST_BlendSubtract(int enable)
1110 {
1111         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1112         command->enable = enable;
1113 }
1114
1115 DEFCOMMAND(10, DepthMask, int enable;)
1116 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1117 {
1118         thread->depthmask = command->enable;
1119 }
1120 void DPSOFTRAST_DepthMask(int enable)
1121 {
1122         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1123         command->enable = enable;
1124 }
1125
1126 DEFCOMMAND(11, DepthFunc, int func;)
1127 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1128 {
1129         thread->depthfunc = command->func;
1130 }
1131 void DPSOFTRAST_DepthFunc(int func)
1132 {
1133         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1134         command->func = func;
1135 }
1136
1137 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1138 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1139 {
1140         thread->depthrange[0] = command->nearval;
1141         thread->depthrange[1] = command->farval;
1142 }
1143 void DPSOFTRAST_DepthRange(float nearval, float farval)
1144 {
1145         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1146         command->nearval = nearval;
1147         command->farval = farval;
1148 }
1149
1150 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1151 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1152 {
1153         thread->polygonoffset[0] = command->alongnormal;
1154         thread->polygonoffset[1] = command->intoview;
1155 }
1156 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1157 {
1158         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1159         command->alongnormal = alongnormal;
1160         command->intoview = intoview;
1161 }
1162
1163 DEFCOMMAND(14, CullFace, int mode;)
1164 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1165 {
1166         thread->cullface = command->mode;
1167 }
1168 void DPSOFTRAST_CullFace(int mode)
1169 {
1170         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1171         command->mode = mode;
1172 }
1173
1174 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1175 {
1176         dpsoftrast.color[0] = r;
1177         dpsoftrast.color[1] = g;
1178         dpsoftrast.color[2] = b;
1179         dpsoftrast.color[3] = a;
1180 }
1181
1182 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1183 {
1184         int outstride = blockwidth * 4;
1185         int instride = dpsoftrast.fb_width * 4;
1186         int bx1 = blockx;
1187         int by1 = blocky;
1188         int bx2 = blockx + blockwidth;
1189         int by2 = blocky + blockheight;
1190         int bw;
1191         int x;
1192         int y;
1193         unsigned char *inpixels;
1194         unsigned char *b;
1195         unsigned char *o;
1196         DPSOFTRAST_Flush();
1197         if (bx1 < 0) bx1 = 0;
1198         if (by1 < 0) by1 = 0;
1199         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1200         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1201         bw = bx2 - bx1;
1202         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1203         if (dpsoftrast.bigendian)
1204         {
1205                 for (y = by1;y < by2;y++)
1206                 {
1207                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1208                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1209                         for (x = bx1;x < bx2;x++)
1210                         {
1211                                 o[0] = b[3];
1212                                 o[1] = b[2];
1213                                 o[2] = b[1];
1214                                 o[3] = b[0];
1215                                 o += 4;
1216                                 b += 4;
1217                         }
1218                 }
1219         }
1220         else
1221         {
1222                 for (y = by1;y < by2;y++)
1223                 {
1224                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1225                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1226                         memcpy(o, b, bw*4);
1227                 }
1228         }
1229
1230 }
1231 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1232 {
1233         int tx1 = tx;
1234         int ty1 = ty;
1235         int tx2 = tx + width;
1236         int ty2 = ty + height;
1237         int sx1 = sx;
1238         int sy1 = sy;
1239         int sx2 = sx + width;
1240         int sy2 = sy + height;
1241         int swidth;
1242         int sheight;
1243         int twidth;
1244         int theight;
1245         int sw;
1246         int sh;
1247         int tw;
1248         int th;
1249         int y;
1250         unsigned int *spixels;
1251         unsigned int *tpixels;
1252         DPSOFTRAST_Texture *texture;
1253         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1254         if (mip < 0 || mip >= texture->mipmaps) return;
1255         DPSOFTRAST_Flush();
1256         spixels = dpsoftrast.fb_colorpixels[0];
1257         swidth = dpsoftrast.fb_width;
1258         sheight = dpsoftrast.fb_height;
1259         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1260         twidth = texture->mipmap[mip][2];
1261         theight = texture->mipmap[mip][3];
1262         if (tx1 < 0) tx1 = 0;
1263         if (ty1 < 0) ty1 = 0;
1264         if (tx2 > twidth) tx2 = twidth;
1265         if (ty2 > theight) ty2 = theight;
1266         if (sx1 < 0) sx1 = 0;
1267         if (sy1 < 0) sy1 = 0;
1268         if (sx2 > swidth) sx2 = swidth;
1269         if (sy2 > sheight) sy2 = sheight;
1270         tw = tx2 - tx1;
1271         th = ty2 - ty1;
1272         sw = sx2 - sx1;
1273         sh = sy2 - sy1;
1274         if (tw > sw) tw = sw;
1275         if (th > sh) th = sh;
1276         if (tw < 1 || th < 1)
1277                 return;
1278         sy1 = sheight - sy1 - th;
1279         ty1 = theight - ty1 - th;
1280         for (y = 0;y < th;y++)
1281                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1282         if (texture->mipmaps > 1)
1283                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1284 }
1285
1286 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1287 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1288 {
1289         if (thread->texbound[command->unitnum])
1290                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1291         thread->texbound[command->unitnum] = command->texture;
1292 }
1293 void DPSOFTRAST_SetTexture(int unitnum, int index)
1294 {
1295         DPSOFTRAST_Command_SetTexture *command;
1296         DPSOFTRAST_Texture *texture;
1297         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1298         {
1299                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1300                 return;
1301         }
1302         texture = DPSOFTRAST_Texture_GetByIndex(index);
1303         if (index && !texture)
1304         {
1305                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1306                 return;
1307         }
1308
1309         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1310         command->unitnum = unitnum;
1311         command->texture = texture;
1312
1313         dpsoftrast.texbound[unitnum] = texture;
1314         if (texture)
1315                 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1316 }
1317
1318 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1319 {
1320         dpsoftrast.pointer_vertex3f = vertex3f;
1321         dpsoftrast.stride_vertex = stride;
1322 }
1323 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1324 {
1325         dpsoftrast.pointer_color4f = color4f;
1326         dpsoftrast.pointer_color4ub = NULL;
1327         dpsoftrast.stride_color = stride;
1328 }
1329 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1330 {
1331         dpsoftrast.pointer_color4f = NULL;
1332         dpsoftrast.pointer_color4ub = color4ub;
1333         dpsoftrast.stride_color = stride;
1334 }
1335 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1336 {
1337         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1338         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1339         dpsoftrast.stride_texcoord[unitnum] = stride;
1340 }
1341
1342 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1343 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1344 {
1345         thread->shader_mode = command->mode;
1346         thread->shader_permutation = command->permutation;
1347         thread->shader_exactspecularmath = command->exactspecularmath;
1348 }
1349 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1350 {
1351         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1352         command->mode = mode;
1353         command->permutation = permutation;
1354         command->exactspecularmath = exactspecularmath;
1355
1356         dpsoftrast.shader_mode = mode;
1357         dpsoftrast.shader_permutation = permutation;
1358         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1359 }
1360
1361 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1362 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1363 {
1364         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1365 }
1366 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1367 {
1368         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1369         command->index = index;
1370         command->val[0] = v0;
1371         command->val[1] = v1;
1372         command->val[2] = v2;
1373         command->val[3] = v3;
1374
1375         dpsoftrast.uniform4f[index*4+0] = v0;
1376         dpsoftrast.uniform4f[index*4+1] = v1;
1377         dpsoftrast.uniform4f[index*4+2] = v2;
1378         dpsoftrast.uniform4f[index*4+3] = v3;
1379 }
1380 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1381 {
1382         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1383         command->index = index;
1384         memcpy(command->val, v, sizeof(command->val));
1385
1386         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1387 }
1388
1389 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1390 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1391 {
1392         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1393 }
1394 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1395 {
1396 #ifdef SSE_POSSIBLE
1397         int i, index;
1398         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1399         {
1400                 __m128 m0, m1, m2, m3;
1401                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1402                 command->index = (DPSOFTRAST_UNIFORM)index;
1403                 if (((size_t)v)&(ALIGN_SIZE-1))
1404                 {
1405                         m0 = _mm_loadu_ps(v);
1406                         m1 = _mm_loadu_ps(v+4);
1407                         m2 = _mm_loadu_ps(v+8);
1408                         m3 = _mm_loadu_ps(v+12);
1409                 }
1410                 else
1411                 {
1412                         m0 = _mm_load_ps(v);
1413                         m1 = _mm_load_ps(v+4);
1414                         m2 = _mm_load_ps(v+8);
1415                         m3 = _mm_load_ps(v+12);
1416                 }
1417                 if (transpose)
1418                 {
1419                         __m128 t0, t1, t2, t3;
1420                         t0 = _mm_unpacklo_ps(m0, m1);
1421                         t1 = _mm_unpacklo_ps(m2, m3);
1422                         t2 = _mm_unpackhi_ps(m0, m1);
1423                         t3 = _mm_unpackhi_ps(m2, m3);
1424                         m0 = _mm_movelh_ps(t0, t1);
1425                         m1 = _mm_movehl_ps(t1, t0);
1426                         m2 = _mm_movelh_ps(t2, t3);
1427                         m3 = _mm_movehl_ps(t3, t2);                     
1428                 }
1429                 _mm_store_ps(command->val, m0);
1430                 _mm_store_ps(command->val+4, m1);
1431                 _mm_store_ps(command->val+8, m2);
1432                 _mm_store_ps(command->val+12, m3);
1433                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1434                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1435                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1436                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1437         }
1438 #endif
1439 }
1440
1441 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1442 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1443 {
1444         thread->uniform1i[command->index] = command->val;
1445 }
1446 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1447 {
1448         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1449         command->index = index;
1450         command->val = i0;
1451
1452         dpsoftrast.uniform1i[command->index] = i0;
1453 }
1454
1455 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1456 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1457 {
1458         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1459         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1460 }
1461 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1462 {
1463         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1464         command->clipplane[0] = x;
1465         command->clipplane[1] = y;
1466         command->clipplane[2] = z;
1467         command->clipplane[3] = w;
1468 }
1469
1470 #ifdef SSE_POSSIBLE
1471 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1472 {
1473         float *end = dst + size*4;
1474         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1475         {
1476                 while (dst < end)
1477                 {
1478                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1479                         dst += 4;
1480                         src += stride;
1481                 }
1482         }
1483         else
1484         {
1485                 while (dst < end)
1486                 {
1487                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1488                         dst += 4;
1489                         src += stride;
1490                 }
1491         }
1492 }
1493
1494 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1495 {
1496         float *end = dst + size*4;
1497         if (stride == sizeof(float[3]))
1498         {
1499                 float *end4 = dst + (size&~3)*4;        
1500                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1501                 {
1502                         while (dst < end4)
1503                         {
1504                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1505                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1506                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1507                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1508                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1509                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1510                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1511                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1512                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1513                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1514                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1515                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1516                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1517                                 dst += 16;
1518                                 src += 4*sizeof(float[3]);
1519                         }
1520                 }
1521                 else
1522                 {
1523                         while (dst < end4)
1524                         {
1525                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1526                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1527                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1528                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1529                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1530                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1531                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1532                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1533                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1534                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1535                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1536                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1537                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1538                                 dst += 16;
1539                                 src += 4*sizeof(float[3]);
1540                         }
1541                 }
1542         }
1543         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1544         {
1545                 while (dst < end)
1546                 {
1547                         __m128 v = _mm_loadu_ps((const float *)src);
1548                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1549                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1550                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1551                         _mm_store_ps(dst, v);
1552                         dst += 4;
1553                         src += stride;
1554                 }
1555         }
1556         else
1557         {
1558                 while (dst < end)
1559                 {
1560                         __m128 v = _mm_load_ps((const float *)src);
1561                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1562                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1563                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1564                         _mm_store_ps(dst, v);
1565                         dst += 4;
1566                         src += stride;
1567                 }
1568         }
1569 }
1570
1571 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1572 {
1573         float *end = dst + size*4;
1574         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1575         if (stride == sizeof(float[2]))
1576         {
1577                 float *end2 = dst + (size&~1)*4;
1578                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1579                 {
1580                         while (dst < end2)
1581                         {
1582                                 __m128 v = _mm_loadu_ps((const float *)src);
1583                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1584                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1585                                 dst += 8;
1586                                 src += 2*sizeof(float[2]);
1587                         }
1588                 }
1589                 else
1590                 {
1591                         while (dst < end2)
1592                         {
1593                                 __m128 v = _mm_load_ps((const float *)src);
1594                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1595                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1596                                 dst += 8;
1597                                 src += 2*sizeof(float[2]);
1598                         }
1599                 }
1600         }
1601         while (dst < end)
1602         {
1603                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1604                 dst += 4;
1605                 src += stride;
1606         }
1607 }
1608
1609 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1610 {
1611         float *end = dst + size*4;
1612         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1613         if (stride == sizeof(unsigned char[4]))
1614         {
1615                 float *end4 = dst + (size&~3)*4;
1616                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1617                 {
1618                         while (dst < end4)
1619                         {
1620                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1621                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1622                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1623                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1624                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1625                                 dst += 16;
1626                                 src += 4*sizeof(unsigned char[4]);
1627                         }
1628                 }
1629                 else
1630                 {
1631                         while (dst < end4)
1632                         {
1633                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1634                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1635                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1636                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1637                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1638                                 dst += 16;
1639                                 src += 4*sizeof(unsigned char[4]);
1640                         }
1641                 }
1642         }
1643         while (dst < end)
1644         {
1645                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1646                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1647                 dst += 4;
1648                 src += stride;
1649         }
1650 }
1651
// Writes `size` copies of the four floats at src to the packed array dst.
// src may be unaligned; dst is written with aligned stores and therefore
// has to be 16 byte aligned.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
        const __m128 value = _mm_loadu_ps(src);
        int i;
        for (i = 0; i < size; i++)
                _mm_store_ps(dst + 4*i, value);
}
1662 #endif
1663
// Multiplies `numitems` four-float vectors by a 4x4 matrix:
//   out = x * m[0..3] + y * m[4..7] + z * m[8..11] + w * m[12..15]
// in4f may be unaligned and may be the same pointer as out4f; out4f is
// written with aligned stores and therefore has to be 16 byte aligned.
// An identity matrix degenerates to a plain copy (or to nothing when the
// transform is in place).  Without SSE_POSSIBLE the function does nothing.
static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
        static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
        __m128 col0, col1, col2, col3;
        float *stop;
        if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
        {
                // nothing to multiply, only move the data if it has to move
                if (out4f != in4f)
                        memcpy(out4f, in4f, numitems * sizeof(float[4]));
                return;
        }
        stop = out4f + numitems*4;
        col0 = _mm_loadu_ps(inmatrix16f);
        col1 = _mm_loadu_ps(inmatrix16f + 4);
        col2 = _mm_loadu_ps(inmatrix16f + 8);
        col3 = _mm_loadu_ps(inmatrix16f + 12);
        if (((size_t)in4f)&(ALIGN_SIZE-1))
        {
                // unaligned input
                for (; out4f < stop; out4f += 4, in4f += 4)
                {
                        __m128 vert = _mm_loadu_ps(in4f);
                        __m128 zw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vert, vert, _MM_SHUFFLE(2, 2, 2, 2)), col2),
                                               _mm_mul_ps(_mm_shuffle_ps(vert, vert, _MM_SHUFFLE(3, 3, 3, 3)), col3));
                        __m128 yzw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vert, vert, _MM_SHUFFLE(1, 1, 1, 1)), col1), zw);
                        _mm_store_ps(out4f, _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vert, vert, _MM_SHUFFLE(0, 0, 0, 0)), col0), yzw));
                }
        }
        else
        {
                // 16 byte aligned input
                for (; out4f < stop; out4f += 4, in4f += 4)
                {
                        __m128 vert = _mm_load_ps(in4f);
                        __m128 zw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vert, vert, _MM_SHUFFLE(2, 2, 2, 2)), col2),
                                               _mm_mul_ps(_mm_shuffle_ps(vert, vert, _MM_SHUFFLE(3, 3, 3, 3)), col3));
                        __m128 yzw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vert, vert, _MM_SHUFFLE(1, 1, 1, 1)), col1), zw);
                        _mm_store_ps(out4f, _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vert, vert, _MM_SHUFFLE(0, 0, 0, 0)), col0), yzw));
                }
        }
#endif
}
1711
1712 #if 0
// Copies `numitems` four-float vectors from in4f to out4f
// (currently compiled out by the surrounding #if 0).
static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
        const size_t bytes = numitems * sizeof(float[4]);
        memcpy(out4f, in4f, bytes);
}
1717 #endif
1718
1719 #ifdef SSE_POSSIBLE
// Perspective-divides a vertex and maps it through the viewport.
// With in = (x, y, z, w), center = (c0, c1, c2, c3), scale = (s0, s1, s2, s3):
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// i.e. lane 0 of center/scale belongs to the 1/w term that ends up in lane 3.
// `out` may be the same variable as `in`.  The expansion is a brace block.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
        __m128 projectvertex_pos = (in); \
        __m128 projectvertex_w = _mm_shuffle_ps(projectvertex_pos, projectvertex_pos, _MM_SHUFFLE(3, 3, 3, 3)); \
        projectvertex_pos = _mm_shuffle_ps(projectvertex_pos, projectvertex_pos, _MM_SHUFFLE(2, 1, 0, 3)); \
        projectvertex_pos = _mm_move_ss(projectvertex_pos, _mm_set_ss(1.0f)); \
        projectvertex_pos = _mm_mul_ps(viewportscale, projectvertex_pos); \
        projectvertex_pos = _mm_div_ps(projectvertex_pos, projectvertex_w); \
        projectvertex_pos = _mm_add_ps(viewportcenter, projectvertex_pos); \
        out = _mm_shuffle_ps(projectvertex_pos, projectvertex_pos, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1727
// Multiplies a vertex by a matrix given as four registers:
//   out = x*m0 + (y*m1 + (z*m2 + w*m3))      with in = (x, y, z, w)
// `out` may be the same variable as `in`.  The expansion is a brace block.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
        __m128 transformvertex_pos = (in); \
        __m128 transformvertex_zw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(transformvertex_pos, transformvertex_pos, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
                                               _mm_mul_ps(_mm_shuffle_ps(transformvertex_pos, transformvertex_pos, _MM_SHUFFLE(3, 3, 3, 3)), m3)); \
        __m128 transformvertex_yzw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(transformvertex_pos, transformvertex_pos, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
                                                transformvertex_zw); \
        out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(transformvertex_pos, transformvertex_pos, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
                         transformvertex_yzw); \
}
1736
1737 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1738 {
1739         int clipmask = 0xFF;
1740         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1741         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1742         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1743         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1744         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1745         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1746         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1747         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1748         #define BBFRONT(k, pos) \
1749         { \
1750                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1751                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1752                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1753                 { \
1754                         __m128 proj; \
1755                         clipmask &= ~(1<<k); \
1756                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1757                         minproj = _mm_min_ss(minproj, proj); \
1758                         maxproj = _mm_max_ss(maxproj, proj); \
1759                 } \
1760         }
1761         BBFRONT(0, minpos); 
1762         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1763         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1764         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1765         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1766         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1767         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1768         BBFRONT(7, maxpos);
1769         #define BBCLIP(k) \
1770         { \
1771                 if (clipmask&(1<<k)) \
1772                 { \
1773                         if (!(clipmask&(1<<(k^1)))) \
1774                         { \
1775                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1776                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1777                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1778                                 minproj = _mm_min_ss(minproj, proj); \
1779                                 maxproj = _mm_max_ss(maxproj, proj); \
1780                         } \
1781                         if (!(clipmask&(1<<(k^2)))) \
1782                         { \
1783                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1784                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1785                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1786                                 minproj = _mm_min_ss(minproj, proj); \
1787                                 maxproj = _mm_max_ss(maxproj, proj); \
1788                         } \
1789                         if (!(clipmask&(1<<(k^4)))) \
1790                         { \
1791                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1792                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1793                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1794                                 minproj = _mm_min_ss(minproj, proj); \
1795                                 maxproj = _mm_max_ss(maxproj, proj); \
1796                         } \
1797                 } \
1798         }
1799         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1800         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1801         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1802         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1803         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1804         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1805         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1806         *starty = _mm_cvttss_si32(maxproj);
1807         *endy = _mm_cvttss_si32(minproj)+1;
1808         return clipmask;
1809 }
1810         
1811 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1812 {
1813         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1814         float *end = out4f + numitems*4;
1815         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1816         __m128 minpos, maxpos;
1817         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1818         {
1819                 minpos = maxpos = _mm_loadu_ps(in4f);
1820                 while (out4f < end)
1821                 {
1822                         __m128 v = _mm_loadu_ps(in4f);
1823                         minpos = _mm_min_ps(minpos, v);
1824                         maxpos = _mm_max_ps(maxpos, v);
1825                         _mm_store_ps(out4f, v);
1826                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1827                         _mm_store_ps(screen4f, v);
1828                         in4f += 4;
1829                         out4f += 4;
1830                         screen4f += 4;
1831                 }
1832         }
1833         else
1834         {
1835                 minpos = maxpos = _mm_load_ps(in4f);
1836                 while (out4f < end)
1837                 {
1838                         __m128 v = _mm_load_ps(in4f);
1839                         minpos = _mm_min_ps(minpos, v);
1840                         maxpos = _mm_max_ps(maxpos, v);
1841                         _mm_store_ps(out4f, v);
1842                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1843                         _mm_store_ps(screen4f, v);
1844                         in4f += 4;
1845                         out4f += 4;
1846                         screen4f += 4;
1847                 }
1848         }
1849         if (starty && endy) 
1850         {
1851                 ALIGN(float minposf[4]);
1852                 ALIGN(float maxposf[4]);
1853                 _mm_store_ps(minposf, minpos);
1854                 _mm_store_ps(maxposf, maxpos);
1855                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1856         }
1857         return 0;
1858 }
1859
1860 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1861 {
1862         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1863         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1864         float *end;
1865         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1866                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1867         end = out4f + numitems*4;
1868         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1869         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1870         m0 = _mm_loadu_ps(inmatrix16f);
1871         m1 = _mm_loadu_ps(inmatrix16f + 4);
1872         m2 = _mm_loadu_ps(inmatrix16f + 8);
1873         m3 = _mm_loadu_ps(inmatrix16f + 12);
1874         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1875         {
1876                 minpos = maxpos = _mm_loadu_ps(in4f);
1877                 while (out4f < end)
1878                 {
1879                         __m128 v = _mm_loadu_ps(in4f);
1880                         minpos = _mm_min_ps(minpos, v);
1881                         maxpos = _mm_max_ps(maxpos, v);
1882                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1883                         _mm_store_ps(out4f, v);
1884                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1885                         _mm_store_ps(screen4f, v);
1886                         in4f += 4;
1887                         out4f += 4;
1888                         screen4f += 4;
1889                 }
1890         }
1891         else
1892         {
1893                 minpos = maxpos = _mm_load_ps(in4f);
1894                 while (out4f < end)
1895                 {
1896                         __m128 v = _mm_load_ps(in4f);
1897                         minpos = _mm_min_ps(minpos, v);
1898                         maxpos = _mm_max_ps(maxpos, v);
1899                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1900                         _mm_store_ps(out4f, v);
1901                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1902                         _mm_store_ps(screen4f, v);
1903                         in4f += 4;
1904                         out4f += 4;
1905                         screen4f += 4;
1906                 }
1907         }
1908         if (starty && endy) 
1909         {
1910                 ALIGN(float minposf[4]);
1911                 ALIGN(float maxposf[4]);
1912                 _mm_store_ps(minposf, minpos);
1913                 _mm_store_ps(maxposf, maxpos);
1914                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1915         }
1916         return 0;
1917 }
1918 #endif
1919
// Expands the client vertex array selected by `inarray` into four-float
// vectors in dpsoftrast.post_array4f[outarray], for the vertex range
// [firstvertex, firstvertex + numvertices), and returns that buffer.
//  - position: three floats per vertex
//  - color: four floats, else four bytes, else the constant dpsoftrast.color
//  - anything else is treated as a texcoord array with 2, 3 or 4 floats; if
//    its pointer is unset or the component count is something else, the
//    buffer is returned without being written
// Without SSE_POSSIBLE the function returns NULL.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
        float *outf = dpsoftrast.post_array4f[outarray];
        const unsigned char *inb;
        int firstvertex = dpsoftrast.firstvertex;
        int numvertices = dpsoftrast.numvertices;
        int stride;
        if (inarray == DPSOFTRAST_ARRAY_POSITION)
        {
                stride = dpsoftrast.stride_vertex;
                inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
                DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
        }
        else if (inarray == DPSOFTRAST_ARRAY_COLOR)
        {
                stride = dpsoftrast.stride_color;
                if (dpsoftrast.pointer_color4f)
                {
                        inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
                        DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
                }
                else if (dpsoftrast.pointer_color4ub)
                {
                        inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
                        DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
                }
                else
                {
                        // no color array bound: replicate the current color
                        DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
                }
        }
        else
        {
                int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
                stride = dpsoftrast.stride_texcoord[unit];
                if (dpsoftrast.pointer_texcoordf[unit])
                {
                        int components = dpsoftrast.components_texcoord[unit];
                        inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
                        if (components == 2)
                                DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
                        else if (components == 3)
                                DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
                        else if (components == 4)
                                DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
                }
        }
        return outf;
#else
        return NULL;
#endif
}
1978
1979 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1980 {
1981         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1982         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1983         return data;
1984 }
1985
1986 #if 0
// Projects a vertex array into dpsoftrast.screencoord4f without transforming
// it and returns the array (currently compiled out by the surrounding #if 0).
// With inarray >= 0 the array is first loaded into post_array4f[outarray].
// The result of DPSOFTRAST_Vertex_Project is stored in dpsoftrast.drawclipped,
// the y range in dpsoftrast.drawstarty / drawendy.
// Without SSE_POSSIBLE the function returns NULL.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
        float *data;
        if (inarray >= 0)
                data = DPSOFTRAST_Array_Load(outarray, inarray);
        else
                data = dpsoftrast.post_array4f[outarray];
        dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
        return data;
#else
        return NULL;
#endif
}
1997 #endif
1998
// Transforms a vertex array in place by inmatrix16f, projects it into
// dpsoftrast.screencoord4f and returns the array.
// With inarray >= 0 the array is first loaded into post_array4f[outarray].
// The result of DPSOFTRAST_Vertex_TransformProject is stored in
// dpsoftrast.drawclipped, the y range in dpsoftrast.drawstarty / drawendy.
// Without SSE_POSSIBLE the function returns NULL.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
        float *data;
        if (inarray >= 0)
                data = DPSOFTRAST_Array_Load(outarray, inarray);
        else
                data = dpsoftrast.post_array4f[outarray];
        dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
        return data;
#else
        return NULL;
#endif
}
2009
2010 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2011 {
2012         int x;
2013         int startx = span->startx;
2014         int endx = span->endx;
2015         float wslope = triangle->w[0];
2016         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2017         float endz = 1.0f / (w + wslope * startx);
2018         if (triangle->w[0] == 0)
2019         {
2020                 // LordHavoc: fast flat polygons (HUD/menu)
2021                 for (x = startx;x < endx;x++)
2022                         zf[x] = endz;
2023                 return;
2024         }
2025         for (x = startx;x < endx;)
2026         {
2027                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2028                 float z = endz, dz;
2029                 if (nextsub >= endx) nextsub = endsub = endx-1;
2030                 endz = 1.0f / (w + wslope * nextsub);
2031                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2032                 for (; x <= endsub; x++, z += dz)
2033                         zf[x] = z;
2034         }
2035 }
2036
// DPSOFTRAST_Draw_Span_FinishBGRA8: writes one span of shaded pixels into the
// first color buffer (dpsoftrast.fb_colorpixels[0]), applying the per-pixel
// mask and the blend mode selected in thread->fb_blendmode.
//
//   thread   - supplies shader_permutation (alpha kill) and fb_blendmode
//   triangle - not used by this function
//   span     - supplies startx/endx, the x/y origin of the span in the
//              framebuffer, and pixelmask (which this function modifies)
//   in4ub    - source pixels, 4 bytes per pixel indexed by the same x as the
//              span; byte 3 of each pixel is treated as alpha
//
// The whole body is compiled only when SSE_POSSIBLE is defined; otherwise the
// function does nothing.  It also returns without drawing when no color
// buffer is bound.
2037 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2038 {
2039 #ifdef SSE_POSSIBLE
2040         int x;
2041         int startx = span->startx;
2042         int endx = span->endx;
2043         int maskx;
2044         int subx;
2045         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2046         unsigned char * RESTRICT pixelmask = span->pixelmask;
2047         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2048         if (!pixeli)
2049                 return;
2050         pixeli += span->y * dpsoftrast.fb_width + span->x;
2051         // handle alphatest now (this affects depth writes too)
2052         if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2053                 for (x = startx;x < endx;x++)
2054                         if (in4ub[x*4+3] < 128)
2055                                 pixelmask[x] = false;
2056         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2057         // helps sprites, text and hud artwork
2058         switch(thread->fb_blendmode)
2059         {
2060         case DPSOFTRAST_BLENDMODE_ALPHA:
2061         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2062         case DPSOFTRAST_BLENDMODE_SUBALPHA:
                     // trim the span to the pixels that have non-zero alpha: startx
                     // moves to the first such pixel, maskx tracks the end of the last
                     // run of such pixels (and becomes the new endx), and zero-alpha
                     // pixels found between runs are masked off
2063                 maskx = startx;
2064                 for (x = startx;x < endx;x++)
2065                 {
2066                         if (in4ub[x*4+3] >= 1)
2067                         {
2068                                 startx = x;
2069                                 for (;;)
2070                                 {
2071                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2072                                         maskx = x;
2073                                         if (x >= endx) break;
                                             // NOTE(review): after this increment the while below
                                             // pre-increments x again, so the pixel at the index
                                             // reached here is neither alpha-tested nor masked;
                                             // confirm that skipping it is intended
2074                                         ++x;
2075                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2076                                         if (x >= endx) break;
2077                                 }
2078                                 break;
2079                         }
2080                 }
2081                 endx = maskx;
2082                 break;
2083         case DPSOFTRAST_BLENDMODE_OPAQUE:
2084         case DPSOFTRAST_BLENDMODE_ADD:
2085         case DPSOFTRAST_BLENDMODE_INVMOD:
2086         case DPSOFTRAST_BLENDMODE_MUL:
2087         case DPSOFTRAST_BLENDMODE_MUL2:
2088         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2089         case DPSOFTRAST_BLENDMODE_INVADD:
2090                 break;
2091         }
2092         // put some special values at the end of the mask to ensure the loops end
             // (this writes pixelmask[endx] and pixelmask[endx+1], so the mask
             // buffer must have room for two entries past endx; the nonzero entry
             // stops the "skip masked pixels" scans and the zero entry stops the
             // "measure unmasked run" scans)
2093         pixelmask[endx] = 1;
2094         pixelmask[endx+1] = 0;
2095         // LordHavoc: use a double loop to identify subspans, this helps the
2096         // optimized copy/blend loops to perform at their best, most triangles
2097         // have only one run of pixels, and do the search using wide reads...
2098         x = startx;
2099         while (x < endx)
2100         {
2101                 // if this pixel is masked off, it's probably not alone...
2102                 if (!pixelmask[x])
2103                 {
2104                         x++;
2105 #if 1
2106                         if (x + 8 < endx)
2107                         {
2108                                 // the 4-item search must be aligned or else it stalls badly
2109                                 if ((x & 3) && !pixelmask[x]) 
2110                                 {
2111                                         if(pixelmask[x]) goto endmasked;
2112                                         x++;
2113                                         if (x & 3)
2114                                         {
2115                                                 if(pixelmask[x]) goto endmasked;
2116                                                 x++;
2117                                                 if (x & 3)
2118                                                 {
2119                                                         if(pixelmask[x]) goto endmasked;
2120                                                         x++;
2121                                                 }
2122                                         }
2123                                 }
2124                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2125                                         x += 4;
2126                         }
2127 #endif
2128                         for (;!pixelmask[x];x++)
2129                                 ;
2130                         // rather than continue the loop, just check the end variable
2131                         if (x >= endx)
2132                                 break;
2133                 }
2134         endmasked:
2135                 // find length of subspan
                     // (x is the first unmasked pixel; subx ends up one past the last
                     // pixel of the run that starts at x)
2136                 subx = x + 1;
2137 #if 1
2138                 if (subx + 8 < endx)
2139                 {
2140                         if (subx & 3)
2141                         {
2142                                 if(!pixelmask[subx]) goto endunmasked;
2143                                 subx++;
2144                                 if (subx & 3)
2145                                 {
2146                                         if(!pixelmask[subx]) goto endunmasked;
2147                                         subx++;
2148                                         if (subx & 3)
2149                                         {
2150                                                 if(!pixelmask[subx]) goto endunmasked;
2151                                                 subx++;
2152                                         }
2153                                 }
2154                         }
2155                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2156                                 subx += 4;
2157                 }
2158 #endif
2159                 for (;pixelmask[subx];subx++)
2160                         ;
2161                 // the checks can overshoot, so make sure to clip it...
2162                 if (subx > endx)
2163                         subx = endx;
2164         endunmasked:
2165                 // now that we know the subspan length...  process!
                     // (every case below advances x up to subx)
2166                 switch(thread->fb_blendmode)
2167                 {
2168                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2169 #if 0
2170                         if (subx - x >= 16)
2171                         {
2172                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2173                                 x = subx;
2174                         }
2175                         else
2176 #elif 1
2177                         while (x + 16 <= subx)
2178                         {
2179                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2180                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2181                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2182                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2183                                 x += 16;
2184                         }
2185 #endif
2186                         {
2187                                 while (x + 4 <= subx)
2188                                 {
2189                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2190                                         x += 4;
2191                                 }
2192                                 if (x + 2 <= subx)
2193                                 {
2194                                         pixeli[x] = ini[x];
2195                                         pixeli[x+1] = ini[x+1];
2196                                         x += 2;
2197                                 }
2198                                 if (x < subx)
2199                                 {
2200                                         pixeli[x] = ini[x];
2201                                         x++;
2202                                 }
2203                         }
2204                         break;
2205                 case DPSOFTRAST_BLENDMODE_ALPHA:
                     // FINISHBLEND(blend2, blend1): blends the run [x, subx) in place.
                     // Source and destination bytes are widened to 16-bit lanes (src, dst);
                     // blend2 is the statement applied to two pixels at a time, blend1 the
                     // statement applied to a single leftover pixel.  Each statement must
                     // leave its result in dst, which is packed back to bytes with unsigned
                     // saturation and stored.  The macro is defined here, inside the switch,
                     // and is reused by the cases that follow.
2206                 #define FINISHBLEND(blend2, blend1) \
2207                         for (;x + 1 < subx;x += 2) \
2208                         { \
2209                                 __m128i src, dst; \
2210                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2211                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2212                                 blend2; \
2213                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2214                         } \
2215                         if (x < subx) \
2216                         { \
2217                                 __m128i src, dst; \
2218                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2219                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2220                                 blend1; \
2221                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2222                                 x++; \
2223                         }
2224                         FINISHBLEND({
2225                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2226                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2227                         }, {
2228                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2229                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2230                         });
2231                         break;
2232                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2233                         FINISHBLEND({
2234                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2235                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2236                         }, {
2237                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2238                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2239                         });
2240                         break;
2241                 case DPSOFTRAST_BLENDMODE_ADD:
2242                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2243                         break;
2244                 case DPSOFTRAST_BLENDMODE_INVMOD:
2245                         FINISHBLEND({
2246                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2247                         }, {
2248                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2249                         });
2250                         break;
2251                 case DPSOFTRAST_BLENDMODE_MUL:
2252                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2253                         break;
2254                 case DPSOFTRAST_BLENDMODE_MUL2:
2255                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2256                         break;
2257                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2258                         FINISHBLEND({
2259                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2260                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2261                         }, {
2262                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2263                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2264                         });
2265                         break;
2266                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2267                         FINISHBLEND({
2268                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2269                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2270                         }, {
2271                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2272                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2273                         });
2274                         break;
2275                 case DPSOFTRAST_BLENDMODE_INVADD:
2276                         FINISHBLEND({
2277                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2278                         }, {
2279                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2280                         });
2281                         break;
2282                 }
2283         }
2284 #endif
2285 }
2286
2287 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2288         // warning: this is SLOW, only use if the optimized per-span functions won't do
2289 {
2290         const unsigned char * RESTRICT pixelbase;
2291         const unsigned char * RESTRICT pixel[4];
2292         int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2293         int wrapmask[2] = { width-1, height-1 };
2294         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*width;
2295         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2296         {
2297                 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2298                 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2299                 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2300                 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2301                 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2302                 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2303                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2304                 {
2305                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2306                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2307                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2308                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2309                 }
2310                 else
2311                 {
2312                         tci[0] &= wrapmask[0];
2313                         tci[1] &= wrapmask[1];
2314                         tci1[0] &= wrapmask[0];
2315                         tci1[1] &= wrapmask[1];
2316                 }
2317                 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2318                 pixel[1] = pixelbase + 4 * (tci[0] - tci[1]*width);
2319                 pixel[2] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2320                 pixel[3] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2321                 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2322                 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2323                 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2324                 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2325         }
2326         else
2327         {
2328                 int tci[2] = { x * width, y * height };
2329                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2330                 {
2331                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2332                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2333                 }
2334                 else
2335                 {
2336                         tci[0] &= wrapmask[0];
2337                         tci[1] &= wrapmask[1];
2338                 }
2339                 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2340                 c[0] = pixel[0][0];
2341                 c[1] = pixel[0][1];
2342                 c[2] = pixel[0][2];
2343                 c[3] = pixel[0][3];
2344         }
2345 }
2346
// DPSOFTRAST_Draw_Span_Texture2DVarying is compiled out by the #if 0 below.
//
// It is the float-output texture fetch for one span: for every x in
// [span->startx, span->endx) it writes four floats to out4f[x*4..x*4+3],
// sampled from the texture bound to thread->texbound[texunitindex] at mip
// level triangle->mip[texunitindex].  Stored bytes 0 and 2 are swapped on
// output and all channels are scaled from 0..255 to 0..1.
//
// Texture coordinates come from DPSOFTRAST_CALCATTRIB4F (presumably the
// interpolant base and x slope for arrayindex - confirm against the macro)
// and are multiplied by zf[x] at sub-span boundaries, then stepped linearly
// in 12-bit fixed point within each sub-span of DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels.
//
// With no texture bound the span is filled with 1.0; a mip level whose
// mipmap[mip][1] equals 4 is treated as a single pixel and filled with the
// first four texture bytes.
//
// NOTE(review): tci/tci1 are unsigned while tciwidth is negative, so the row
// offsets are computed in unsigned arithmetic before being added to
// pixelbase, and the ">= tcimin" clamp tests compare unsigned values against
// 0; verify both before re-enabling this code.
2347 #if 0
2348 static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2349 {
2350         int x;
2351         int startx = span->startx;
2352         int endx = span->endx;
2353         int flags;
2354         float c[4];
2355         float data[4];
2356         float slope[4];
2357         float tc[2], endtc[2];
2358         float tcscale[2];
2359         unsigned int tci[2];
2360         unsigned int tci1[2];
2361         unsigned int tcimin[2];
2362         unsigned int tcimax[2];
2363         int tciwrapmask[2];
2364         int tciwidth;
2365         int filter;
2366         int mip;
2367         const unsigned char * RESTRICT pixelbase;
2368         const unsigned char * RESTRICT pixel[4];
2369         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2370         // if no texture is bound, just fill it with white
2371         if (!texture)
2372         {
2373                 for (x = startx;x < endx;x++)
2374                 {
2375                         out4f[x*4+0] = 1.0f;
2376                         out4f[x*4+1] = 1.0f;
2377                         out4f[x*4+2] = 1.0f;
2378                         out4f[x*4+3] = 1.0f;
2379                 }
2380                 return;
2381         }
2382         mip = triangle->mip[texunitindex];
2383         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2384         // if this mipmap of the texture is 1 pixel, just fill it with that color
2385         if (texture->mipmap[mip][1] == 4)
2386         {
2387                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2388                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2389                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2390                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2391                 for (x = startx;x < endx;x++)
2392                 {
2393                         out4f[x*4+0] = c[0];
2394                         out4f[x*4+1] = c[1];
2395                         out4f[x*4+2] = c[2];
2396                         out4f[x*4+3] = c[3];
2397                 }
2398                 return;
2399         }
2400         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2401         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2402         flags = texture->flags;
2403         tcscale[0] = texture->mipmap[mip][2];
2404         tcscale[1] = texture->mipmap[mip][3];
2405         tciwidth = -texture->mipmap[mip][2];
2406         tcimin[0] = 0;
2407         tcimin[1] = 0;
2408         tcimax[0] = texture->mipmap[mip][2]-1;
2409         tcimax[1] = texture->mipmap[mip][3]-1;
2410         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2411         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2412         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2413         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2414         if (filter)
2415         {
2416                 endtc[0] -= 0.5f;
2417                 endtc[1] -= 0.5f;
2418         }
2419         for (x = startx;x < endx;)
2420         {
2421                 unsigned int subtc[2];
2422                 unsigned int substep[2];
2423                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2424                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2425                 if (nextsub >= endx)
2426                 {
2427                         nextsub = endsub = endx-1;      
2428                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2429                 }
2430                 tc[0] = endtc[0];
2431                 tc[1] = endtc[1];
2432                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2433                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2434                 if (filter)
2435                 {
2436                         endtc[0] -= 0.5f;
2437                         endtc[1] -= 0.5f;
2438                 }
2439                 substep[0] = (endtc[0] - tc[0]) * subscale;
2440                 substep[1] = (endtc[1] - tc[1]) * subscale;
2441                 subtc[0] = tc[0] * (1<<12);
2442                 subtc[1] = tc[1] * (1<<12);
2443                 if (filter)
2444                 {
2445                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2446                         {
2447                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2448                                 {
2449                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2450                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2451                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2452                                         tci[0] = subtc[0]>>12;
2453                                         tci[1] = subtc[1]>>12;
2454                                         tci1[0] = tci[0] + 1;
2455                                         tci1[1] = tci[1] + 1;
2456                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2457                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2458                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2459                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2460                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2461                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2462                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2463                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2464                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2465                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2466                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2467                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2468                                         out4f[x*4+0] = c[0];
2469                                         out4f[x*4+1] = c[1];
2470                                         out4f[x*4+2] = c[2];
2471                                         out4f[x*4+3] = c[3];
2472                                 }
2473                         }
2474                         else
2475                         {
2476                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2477                                 {
2478                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2479                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2480                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2481                                         tci[0] = subtc[0]>>12;
2482                                         tci[1] = subtc[1]>>12;
2483                                         tci1[0] = tci[0] + 1;
2484                                         tci1[1] = tci[1] + 1;
2485                                         tci[0] &= tciwrapmask[0];
2486                                         tci[1] &= tciwrapmask[1];
2487                                         tci1[0] &= tciwrapmask[0];
2488                                         tci1[1] &= tciwrapmask[1];
2489                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2490                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2491                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2492                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2493                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2494                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2495                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2496                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2497                                         out4f[x*4+0] = c[0];
2498                                         out4f[x*4+1] = c[1];
2499                                         out4f[x*4+2] = c[2];
2500                                         out4f[x*4+3] = c[3];
2501                                 }
2502                         }
2503                 }
2504                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2505                 {
2506                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2507                         {
2508                                 tci[0] = subtc[0]>>12;
2509                                 tci[1] = subtc[1]>>12;
2510                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2511                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2512                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2513                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2514                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2515                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2516                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2517                                 out4f[x*4+0] = c[0];
2518                                 out4f[x*4+1] = c[1];
2519                                 out4f[x*4+2] = c[2];
2520                                 out4f[x*4+3] = c[3];
2521                         }
2522                 }
2523                 else
2524                 {
2525                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2526                         {
2527                                 tci[0] = subtc[0]>>12;
2528                                 tci[1] = subtc[1]>>12;
2529                                 tci[0] &= tciwrapmask[0];
2530                                 tci[1] &= tciwrapmask[1];
2531                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2532                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2533                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2534                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2535                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2536                                 out4f[x*4+0] = c[0];
2537                                 out4f[x*4+1] = c[1];
2538                                 out4f[x*4+2] = c[2];
2539                                 out4f[x*4+3] = c[3];
2540                         }
2541                 }
2542         }
2543 }
2544 #endif
2545
2546 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2547 {
2548 #ifdef SSE_POSSIBLE
2549         int x;
2550         int startx = span->startx;
2551         int endx = span->endx;
2552         int flags;
2553         __m128 data, slope, tcscale;
2554         __m128i tcsize, tcmask, tcoffset, tcmax;
2555         __m128 tc, endtc;
2556         __m128i subtc, substep, endsubtc;
2557         int filter;
2558         int mip;
2559         int affine; // LordHavoc: optimized affine texturing case
2560         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2561         const unsigned char * RESTRICT pixelbase;
2562         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2563         // if no texture is bound, just fill it with white
2564         if (!texture)
2565         {
2566                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2567                 return;
2568         }
2569         mip = triangle->mip[texunitindex];
2570         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2571         // if this mipmap of the texture is 1 pixel, just fill it with that color
2572         if (texture->mipmap[mip][1] == 4)
2573         {
2574                 unsigned int k = *((const unsigned int *)pixelbase);
2575                 for (x = startx;x < endx;x++)
2576                         outi[x] = k;
2577                 return;
2578         }
2579         affine = zf[startx] == zf[endx-1];
2580         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2581         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2582         flags = texture->flags;
2583         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2584         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2585         tcscale = _mm_cvtepi32_ps(tcsize);
2586         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2587         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2588         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2589         if (filter)
2590                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2591         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2592         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_sub_epi32(_mm_setzero_si128(), _mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0))), 18), _mm_set1_epi32(4));
2593         tcmax = _mm_packs_epi32(tcmask, tcmask);
2594         for (x = startx;x < endx;)
2595         {
2596                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2597                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2598                 if (nextsub >= endx || affine)
2599                 {
2600                         nextsub = endsub = endx-1;
2601                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2602                 }       
2603                 tc = endtc;
2604                 subtc = endsubtc;
2605                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2606                 if (filter)
2607                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2608                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2609                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2610                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2611                 substep = _mm_slli_epi32(substep, 1);
2612                 if (filter)
2613                 {
2614                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2615                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2616                         {
2617                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2618                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2619                                 {
2620                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2621                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2622                                         tci = _mm_madd_epi16(tci, tcoffset);
2623                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2624                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2625                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2626                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2627                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2628                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2629                                         fracm = _mm_srli_epi16(subtc, 1);
2630                                         pix1 = _mm_add_epi16(pix1,
2631                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2632                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2633                                         pix3 = _mm_add_epi16(pix3,
2634                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2635                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2636                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2637                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2638                                         pix2 = _mm_add_epi16(pix2,
2639                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2640                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2641                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2642                                 }
2643                                 if (x <= endsub)
2644                                 {
2645                                         const unsigned char * RESTRICT ptr1;
2646                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2647                                         tci = _mm_madd_epi16(tci, tcoffset);
2648                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2649                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2650                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2651                                         fracm = _mm_srli_epi16(subtc, 1);
2652                                         pix1 = _mm_add_epi16(pix1,
2653                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2654                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2655                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2656                                         pix1 = _mm_add_epi16(pix1,
2657                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2658                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2659                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2660                                         x++;
2661                                 }
2662                         }
2663                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2664                         {
2665                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2666                                 {
2667                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2668                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2669                                         tci = _mm_madd_epi16(tci, tcoffset);
2670                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2671                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2672                                                                                         _mm_setzero_si128());
2673                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2674                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2675                                                                                         _mm_setzero_si128());
2676                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2677                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2678                                         tci = _mm_madd_epi16(tci, tcoffset);
2679                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2680                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2681                                                                                         _mm_setzero_si128());
2682                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2683                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2684                                                                                         _mm_setzero_si128());
2685                                         fracm = _mm_srli_epi16(subtc, 1);
2686                                         pix1 = _mm_add_epi16(pix1,
2687                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2688                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2689                                         pix3 = _mm_add_epi16(pix3,
2690                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2691                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2692                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2693                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2694                                         pix2 = _mm_add_epi16(pix2,
2695                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2696                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2697                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2698                                 }
2699                                 if (x <= endsub)
2700                                 {
2701                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2702                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2703                                         tci = _mm_madd_epi16(tci, tcoffset);
2704                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2705                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2706                                                                                         _mm_setzero_si128());
2707                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2708                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2709                                                                                         _mm_setzero_si128());
2710                                         fracm = _mm_srli_epi16(subtc, 1);
2711                                         pix1 = _mm_add_epi16(pix1,
2712                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2713                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2714                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2715                                         pix1 = _mm_add_epi16(pix1,
2716                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2717                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2718                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2719                                         x++;
2720                                 }
2721                         }
2722                         else
2723                         {
2724                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2725                                 {
2726                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2727                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2728                                         tci = _mm_madd_epi16(tci, tcoffset);
2729                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2730                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2731                                                                                         _mm_setzero_si128());
2732                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2733                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2734                                                                                         _mm_setzero_si128());
2735                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2736                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2737                                         tci = _mm_madd_epi16(tci, tcoffset);
2738                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2739                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2740                                                                                         _mm_setzero_si128());
2741                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2742                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2743                                                                                         _mm_setzero_si128());
2744                                         fracm = _mm_srli_epi16(subtc, 1);
2745                                         pix1 = _mm_add_epi16(pix1,
2746                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2747                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2748                                         pix3 = _mm_add_epi16(pix3,
2749                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2750                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2751                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2752                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2753                                         pix2 = _mm_add_epi16(pix2,
2754                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2755                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2756                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2757                                 }
2758                                 if (x <= endsub)
2759                                 {
2760                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2761                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2762                                         tci = _mm_madd_epi16(tci, tcoffset);
2763                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2764                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2765                                                                                         _mm_setzero_si128());
2766                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2767                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2768                                                                                         _mm_setzero_si128());
2769                                         fracm = _mm_srli_epi16(subtc, 1);
2770                                         pix1 = _mm_add_epi16(pix1,
2771                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2772                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2773                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2774                                         pix1 = _mm_add_epi16(pix1,
2775                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2776                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2777                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2778                                         x++;
2779                                 }
2780                         }
2781                 }
2782                 else
2783                 {
2784                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2785                         {
2786                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2787                                 {
2788                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2789                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2790                                         tci = _mm_madd_epi16(tci, tcoffset);
2791                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2792                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2793                                 }
2794                                 if (x <= endsub)
2795                                 {
2796                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2797                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2798                                         tci = _mm_madd_epi16(tci, tcoffset);
2799                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2800                                         x++;
2801                                 }
2802                         }
2803                         else
2804                         {
2805                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2806                                 {
2807                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2808                                         tci = _mm_and_si128(tci, tcmax); 
2809                                         tci = _mm_madd_epi16(tci, tcoffset);
2810                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2811                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2812                                 }
2813                                 if (x <= endsub)
2814                                 {
2815                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2816                                         tci = _mm_and_si128(tci, tcmax); 
2817                                         tci = _mm_madd_epi16(tci, tcoffset);
2818                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2819                                         x++;
2820                                 }
2821                         }
2822                 }
2823         }
2824 #endif
2825 }
2826
2827 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2828 {
2829         // TODO: IMPLEMENT
2830         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2831 }
2832
// Shadowmap sampling stub: the input vector is ignored and the result is
// always 1.0f until a real implementation is written.
static float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2838
2839 #if 0
2840 static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2841 {
2842         int x;
2843         int startx = span->startx;
2844         int endx = span->endx;
2845         float c[4];
2846         float data[4];
2847         float slope[4];
2848         float z;
2849         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2850         for (x = startx;x < endx;x++)
2851         {
2852                 z = zf[x];
2853                 c[0] = (data[0] + slope[0]*x) * z;
2854                 c[1] = (data[1] + slope[1]*x) * z;
2855                 c[2] = (data[2] + slope[2]*x) * z;
2856                 c[3] = (data[3] + slope[3]*x) * z;
2857                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2858                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2859                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2860                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2861         }
2862 }
2863 #endif
2864
2865 #if 0
2866 static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2867 {
2868         int x;
2869         int startx = span->startx;
2870         int endx = span->endx;
2871         float c[4];
2872         float data[4];
2873         float slope[4];
2874         float z;
2875         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2876         for (x = startx;x < endx;x++)
2877         {
2878                 z = zf[x];
2879                 c[0] = (data[0] + slope[0]*x) * z;
2880                 c[1] = (data[1] + slope[1]*x) * z;
2881                 c[2] = (data[2] + slope[2]*x) * z;
2882                 c[3] = (data[3] + slope[3]*x) * z;
2883                 out4f[x*4+0] = c[0];
2884                 out4f[x*4+1] = c[1];
2885                 out4f[x*4+2] = c[2];
2886                 out4f[x*4+3] = c[3];
2887         }
2888 }
2889 #endif
2890
2891 #if 0
2892 static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2893 {
2894         int x, startx = span->startx, endx = span->endx;
2895         float c[4], localcolor[4];
2896         localcolor[0] = subcolor[0];
2897         localcolor[1] = subcolor[1];
2898         localcolor[2] = subcolor[2];
2899         localcolor[3] = subcolor[3];
2900         for (x = startx;x < endx;x++)
2901         {
2902                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2903                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2904                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2905                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2906                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2907                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2908                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2909                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2910         }
2911 }
2912 #endif
2913
2914 #if 0
2915 static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2916 {
2917         int x, startx = span->startx, endx = span->endx;
2918         for (x = startx;x < endx;x++)
2919         {
2920                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2921                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2922                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2923                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2924         }
2925 }
2926 #endif
2927
2928 #if 0
2929 static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2930 {
2931         int x, startx = span->startx, endx = span->endx;
2932         for (x = startx;x < endx;x++)
2933         {
2934                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2935                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2936                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2937                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2938         }
2939 }
2940 #endif
2941
2942 #if 0
2943 static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2944 {
2945         int x, startx = span->startx, endx = span->endx;
2946         float a, b;
2947         for (x = startx;x < endx;x++)
2948         {
2949                 a = 1.0f - inb4f[x*4+3];
2950                 b = inb4f[x*4+3];
2951                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2952                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2953                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2954                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2955         }
2956 }
2957 #endif
2958
// Disabled: the #if 0 / #endif pair below compiles this function out.
// Blends a single constant color over a float span buffer, weighted by the
// color's own fourth component.  For every x in [span->startx, span->endx)
// and each of the four components c:
//   out4f[x*4+c] = in4f[x*4+c] * (1 - color[3]) + color[c] * color[3]
// The triangle parameter is not used.
2959 #if 0
2960 static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2961 {
2962         int x, startx = span->startx, endx = span->endx;
2963         float localcolor[4], ilerp, lerp;
      // the color is copied to locals once, before the loop writes to out4f
2964         localcolor[0] = color[0];
2965         localcolor[1] = color[1];
2966         localcolor[2] = color[2];
2967         localcolor[3] = color[3];
2968         ilerp = 1.0f - localcolor[3];
2969         lerp = localcolor[3];
2970         for (x = startx;x < endx;x++)
2971         {
2972                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2973                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2974                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2975                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2976         }
2977 }
2978 #endif
2979
2980
2981
2982 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2983 {
2984 #ifdef SSE_POSSIBLE
2985         int x;
2986         int startx = span->startx;
2987         int endx = span->endx;
2988         __m128 data, slope;
2989         __m128 mod, endmod;
2990         __m128i submod, substep, endsubmod;
2991         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2992         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2993         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2994         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2995         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2996         for (x = startx; x < endx;)
2997         {
2998                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2999                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3000                 if (nextsub >= endx)
3001                 {
3002                         nextsub = endsub = endx-1;
3003                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3004                 }
3005                 mod = endmod;
3006                 submod = endsubmod;
3007                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3008                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3009                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3010                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3011                 substep = _mm_packs_epi32(substep, substep);
3012                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3013                 {
3014                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3015                         pix = _mm_mulhi_epu16(pix, submod);
3016                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3017                 }
3018                 if (x <= endsub)
3019                 {
3020                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3021                         pix = _mm_mulhi_epu16(pix, submod);
3022                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3023                         x++;
3024                 }
3025         }
3026 #endif
3027 }
3028
// Writes the interpolated vertex attribute 'arrayindex' as 8-bit 4-channel
// pixels into out4ub, for pixels x in [span->startx, span->endx).  The body
// is empty unless SSE_POSSIBLE is defined.
//
// DPSOFTRAST_CALCATTRIB (defined elsewhere) fills 'data' and 'slope' for the
// attribute; presumably the value at x = 0 and its per-pixel increment.
// The value (data + slope*x) * zf[x] is evaluated in floating point only at
// sub-span boundaries (every DPSOFTRAST_DRAW_MAXSUBSPAN pixels, or at the
// last pixel of the span) and stepped linearly in between, using 16-bit
// fixed point scaled by 4095; shifting right by 4 yields the byte value.
3029 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3030 {
3031 #ifdef SSE_POSSIBLE
3032         int x;
3033         int startx = span->startx;
3034         int endx = span->endx;
3035         __m128 data, slope;
3036         __m128 mod, endmod;
3037         __m128i submod, substep, endsubmod;
3038         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
      // swap lanes 0 and 2 (presumably RGBA attribute order -> BGRA pixel order)
3039         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3040         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
      // value at the first pixel; becomes the start value of the first sub-span
3041         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3042         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3043         for (x = startx; x < endx;)
3044         {
3045                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3046                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3047                 if (nextsub >= endx)
3048                 {
                      // final (possibly short) sub-span: interpolate up to the last pixel of the span
3049                         nextsub = endsub = endx-1;
3050                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3051                 }
3052                 mod = endmod;
3053                 submod = endsubmod;
3054                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
              // per-pixel fixed-point step across this sub-span
3055                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3056                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
              // low four 16-bit lanes: value for pixel x; high four lanes: value for pixel x+1
3057                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
              // the loop below consumes two pixels per iteration, so the packed
              // step must be twice the per-pixel step
3058                 substep = _mm_packs_epi32(_mm_slli_epi32(substep, 1), _mm_slli_epi32(substep, 1));
3059                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3060                 {
                      // drop the 4 fractional bits, then saturate each lane to 0..255
3061                         __m128i pix = _mm_srai_epi16(submod, 4);
3062                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3063                 }
3064                 if (x <= endsub)
3065                 {
                      // odd pixel left over in this sub-span: uses the low lanes of submod
3066                         __m128i pix = _mm_srai_epi16(submod, 4);
3067                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3068                         x++;
3069                 }
3070         }
3071 #endif
3072 }
3073
// Adds a thresholded second buffer onto a first one, per 8-bit channel, for
// pixels x in [span->startx, span->endx):
//   out = saturate255( ina + max(inb - subcolor*255, 0) )
// subcolor is four floats; lanes 0 and 2 are swapped after conversion
// (presumably RGBA -> BGRA).  The triangle parameter is not used.  The body
// is empty unless SSE_POSSIBLE is defined.
3074 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3075 {
3076 #ifdef SSE_POSSIBLE
3077         int x, startx = span->startx, endx = span->endx;
3078         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
      // replicate the four 16-bit channel values into both halves (one per pixel of a pair)
3079         localcolor = _mm_packs_epi32(localcolor, localcolor);
      // two pixels per iteration
3080         for (x = startx;x+2 <= endx;x+=2)
3081         {
3082                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3083                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
              // unsigned saturating subtract clamps at zero; packus clamps the sum at 255
3084                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3085                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3086         }
      // leftover single pixel when the span length is odd
3087         if (x < endx)
3088         {
3089                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3090                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3091                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3092                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3093         }
3094 #endif
3095 }
3096
// Multiplies two 8-bit 4-channel span buffers channel by channel, for pixels
// x in [span->startx, span->endx):  out = (ina * inb) / 256.
// The triangle parameter is not used.  The body is empty unless SSE_POSSIBLE
// is defined.
3097 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3098 {
3099 #ifdef SSE_POSSIBLE
3100         int x, startx = span->startx, endx = span->endx;
      // two pixels per iteration
3101         for (x = startx;x+2 <= endx;x+=2)
3102         {
              // pix1 holds the bytes as plain 16-bit values; pix2 holds them in the
              // high byte (value * 256), so the high half of the product is a*b/256
3103                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3104                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3105                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3106                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3107         }
      // leftover single pixel when the span length is odd
3108         if (x < endx)
3109         {
3110                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3111                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3112                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3113                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3114         }
3115 #endif
3116 }
3117
// Adds two 8-bit 4-channel span buffers channel by channel with saturation,
// for pixels x in [span->startx, span->endx):  out = min(ina + inb, 255).
// The triangle parameter is not used.  The body is empty unless SSE_POSSIBLE
// is defined.
3118 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3119 {
3120 #ifdef SSE_POSSIBLE
3121         int x, startx = span->startx, endx = span->endx;
      // two pixels per iteration
3122         for (x = startx;x+2 <= endx;x+=2)
3123         {
              // widen to 16 bits, add, then let packus clamp each lane to 0..255
3124                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3125                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3126                 pix1 = _mm_add_epi16(pix1, pix2);
3127                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3128         }
      // leftover single pixel when the span length is odd
3129         if (x < endx)
3130         {
3131                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3132                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3133                 pix1 = _mm_add_epi16(pix1, pix2);
3134                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3135         }
3136 #endif
3137 }
3138
// Disabled: the outer #if 0 / #endif pair below compiles this function out.
// Adds a tinted second buffer onto a first one, per 8-bit channel, for pixels
// x in [span->startx, span->endx):
//   out = saturate255( ina + inb * tint )
// where tint is the four floats at inbtintbgra converted to fixed point with
// 256 representing 1.0.  Unlike the other color helpers here, the tint lanes
// are not shuffled.  The triangle parameter is not used.
3139 #if 0
3140 static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3141 {
3142 #ifdef SSE_POSSIBLE
3143         int x, startx = span->startx, endx = span->endx;
3144         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
      // replicate the four 16-bit tint values into both halves (one per pixel of a pair)
3145         tint = _mm_packs_epi32(tint, tint);
      // two pixels per iteration
3146         for (x = startx;x+2 <= endx;x+=2)
3147         {
              // pix2 holds inb in the high byte of each lane, so mulhi gives tint * inb / 256
3148                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3149                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3150                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3151                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3152         }
      // leftover single pixel when the span length is odd
3153         if (x < endx)
3154         {
3155                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3156                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3157                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3158                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3159         }
3160 #endif
3161 }
3162 #endif
3163
// Alpha-blends 8-bit 4-channel buffer inb4ub over ina4ub, using inb4ub's own
// fourth channel as the weight, for pixels x in [span->startx, span->endx):
//   out = saturate255( ina + (inb - ina) * inb_alpha / 256 )
// The triangle parameter is not used.  The body is empty unless SSE_POSSIBLE
// is defined.
3164 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3165 {
3166 #ifdef SSE_POSSIBLE
3167         int x, startx = span->startx, endx = span->endx;
      // two pixels per iteration
3168         for (x = startx;x+2 <= endx;x+=2)
3169         {
3170                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3171                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
              // broadcast each pixel's fourth channel of inb across that pixel's four lanes
3172                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
              // signed multiply: ((diff << 4) * (alpha << 4)) >> 16 == diff * alpha / 256
3173                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3174                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3175         }
      // leftover single pixel when the span length is odd (only the low four lanes matter)
3176         if (x < endx)
3177         {
3178                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3179                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3180                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3181                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3182                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3183         }
3184 #endif
3185 }
3186
// Blends one constant color over an 8-bit 4-channel span buffer, weighted by
// the color's own fourth component, for pixels x in [span->startx, span->endx):
//   out = saturate255( in + (color255 - in) * alpha255 / 256 )
// where color255 is the four floats at 'color' scaled by 255 with lanes 0 and
// 2 swapped (presumably RGBA -> BGRA), and alpha255 is its fourth lane.
// The triangle parameter is not used.  The body is empty unless SSE_POSSIBLE
// is defined.
3187 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3188 {
3189 #ifdef SSE_POSSIBLE
3190         int x, startx = span->startx, endx = span->endx;
3191         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
      // replicate the four 16-bit channel values into both halves (one per pixel of a pair)
3192         localcolor = _mm_packs_epi32(localcolor, localcolor);
      // broadcast the fourth lane to every lane and pre-shift it by 4 for the mulhi below
3193         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
      // two pixels per iteration
3194         for (x = startx;x+2 <= endx;x+=2)
3195         {
3196                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
              // signed multiply: ((diff << 4) * (alpha << 4)) >> 16 == diff * alpha / 256
3197                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3198                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3199         }
      // leftover single pixel when the span length is odd
3200         if (x < endx)
3201         {
3202                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3203                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3204                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3205         }
3206 #endif
3207 }
3208
3209
3210
// Vertex stage of the "generic" shader mode.  Transforms/projects the
// position array in place with the matrix stored at uniform slot
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1, then passes the COLOR and
// TEXCOORD0 arrays through DPSOFTRAST_Array_Load.  TEXCOORD1 is passed through
// only when the SPECULAR permutation bit is set in the global
// dpsoftrast.shader_permutation.
3211 static void DPSOFTRAST_VertexShader_Generic(void)
3212 {
3213         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3214         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3215         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3216         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3217                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3218 }
3219
// Pixel stage of the "generic" shader mode for one span.
//
// With the DIFFUSE permutation: samples texture unit GL20TU_FIRST (attribute
// index 2) and multiplies it by the interpolated attribute at index 1
// (presumably the vertex color).  If SPECULAR is also set, texture unit
// GL20TU_SECOND is sampled and combined with the result according to the
// first matching permutation bit:
//   COLORMAPPING        -> multiply
//   GLOW                -> add
//   VERTEXTEXTUREBLEND  -> alpha blend by the second texture's alpha
// Without DIFFUSE the interpolated attribute at index 1 is written directly.
// With ALPHAKILL the alpha channel is then scaled by the Alpha uniform.
// The finished span is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3220 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3221 {
3222         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3223         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3224         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3225         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3226         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3227         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3228         {
3229                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3230                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3231                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3232                 {
                      // the second texture is sampled with the same attribute index (2) as the first
3233                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3234                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3235                         {
3236                                 // multiply
3237                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3238                         }
3239                         else if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3240                         {
3241                                 // add
3242                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3243                         }
3244                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3245                         {
3246                                 // alphablend
3247                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3248                         }
3249                 }
3250         }
3251         else
3252                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3253         if(thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
3254         {
3255                 int x;
              // scale only the alpha byte; the float product is converted back to unsigned char on store
3256                 for (x = span->startx;x < span->endx;x++)
3257                         buffer_FragColorbgra8[x*4+3] = buffer_FragColorbgra8[x*4+3] * thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3258         }
3259         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3260 }
3261
3262
3263
// Vertex stage of the post-process shader mode.  Transforms/projects the
// position array in place with the matrix stored at uniform slot
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1, passes TEXCOORD0 through,
// and calls DPSOFTRAST_Array_Load with TEXCOORD1 and TEXCOORD4 as its two
// array arguments (presumably output and input respectively - confirm against
// DPSOFTRAST_Array_Load).
3264 static void DPSOFTRAST_VertexShader_PostProcess(void)
3265 {
3266         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3267         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3268         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3269 }
3270
// Pixel stage of the post-process shader mode for one span.
// Samples texture unit GL20TU_FIRST (attribute index 2) into the output
// buffer; with the BLOOM permutation it also samples GL20TU_SECOND (attribute
// index 3) and adds it after subtracting the BloomColorSubtract uniform.
// The ViewTintColor uniform is then always blended over the result.
// The SATURATION and GAMMARAMPS permutations are recognised but currently
// do nothing.  The finished span goes to DPSOFTRAST_Draw_Span_FinishBGRA8.
3271 static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3272 {
3273         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3274         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3275         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3276         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3277         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3278         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3279         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3280         {
3281                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3282                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3283         }
      // the output buffer is both source and destination of the tint blend
3284         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3285         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3286         {
3287                 // TODO: implement saturation
3288         }
3289         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3290         {
3291                 // TODO: implement gammaramps
3292         }
3293         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3294 }
3295
3296
3297
// Vertex stage of the depth/shadow shader mode: only transforms/projects the
// position array in place with the matrix stored at uniform slot
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1; no other arrays are touched.
3298 static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3299 {
3300         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3301 }
3302
// Pixel stage of the depth/shadow shader mode for one span: zero-fills the
// span's pixels [span->startx, span->endx) in a local color buffer and hands
// it to DPSOFTRAST_Draw_Span_FinishBGRA8.
3303 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3304 {
3305         // this is never called (because colormask is off when this shader is used)
3306         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3307         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3308         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
      // 4 bytes per pixel; only the part of the buffer covered by the span is cleared
3309         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3310         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3311 }
3312
3313
3314
// Vertex stage of the flat-color shader mode.  Transforms/projects the
// position array in place with the matrix at uniform slot
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1 and transforms TEXCOORD0 in
// place with the matrix at uniform slot DPSOFTRAST_UNIFORM_TexMatrixM1.
3315 static void DPSOFTRAST_VertexShader_FlatColor(void)
3316 {
3317         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3318         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3319 }
3320
// Pixel stage of the flat-color shader mode for one span:
//   out.bgr = texture.bgr * Color_Ambient,  out.a = texture.a * Alpha
// The texture is sampled from unit GL20TU_COLOR with attribute index 2.
// When the blend mode is opaque and ALPHAKILL is off, results are written
// straight into the framebuffer row (dpsoftrast.fb_colorpixels[0]) for pixels
// whose span->pixelmask byte is non-zero; otherwise they go to a local buffer
// that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// The body is empty unless SSE_POSSIBLE is defined.
3321 static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3322 {
3323 #ifdef SSE_POSSIBLE
3324         unsigned char * RESTRICT pixelmask = span->pixelmask;
      // default destination: the framebuffer, offset so that pixel[x*4] addresses span pixel x
3325         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3326         int x, startx = span->startx, endx = span->endx;
3327         __m128i Color_Ambientm;
3328         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3329         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3330         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3331         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3332         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3333         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3334                 pixel = buffer_FragColorbgra8;
      // Color_Ambient uniform as fixed point (256 == 1.0), lanes 0 and 2 swapped (presumably RGBA -> BGRA)
3335         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
      // replace the fourth lane with the Alpha uniform scaled by 255
3336         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3337         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3338         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3339         for (x = startx;x < endx;x++)
3340         {
3341                 __m128i color, pix;
              // fast path: four consecutive pixels whose mask bytes are all exactly 1
3342                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3343                 {
3344                         __m128i pix2;
3345                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                      // texture bytes sit in the high byte of each lane, so mulhi gives factor * byte / 256
3346                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3347                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3348                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                      // the loop increment supplies the fourth step
3349                         x += 3;
3350                         continue;
3351                 }
              // masked-out pixels are left untouched
3352                 if (!pixelmask[x])
3353                         continue;
3354                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3355                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3356                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3357         }
      // only needed when the results were not written directly to the framebuffer
3358         if (pixel == buffer_FragColorbgra8)
3359                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3360 #endif
3361 }
3362
3363
3364
// Vertex stage of the vertex-color shader mode.  Transforms/projects the
// position array in place with the matrix at uniform slot
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1, passes the COLOR array
// through DPSOFTRAST_Array_Load, and transforms TEXCOORD0 in place with the
// matrix at uniform slot DPSOFTRAST_UNIFORM_TexMatrixM1.
3365 static void DPSOFTRAST_VertexShader_VertexColor(void)
3366 {
3367         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3368         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3369         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3370 }
3371
// Pixel stage of the vertex-color shader mode for one span:
//   out.bgr = texture.bgr * (Color_Diffuse * vertexcolor + Color_Ambient)
//   out.a   = texture.a * Alpha
// The texture is sampled from unit GL20TU_COLOR with attribute index 2; the
// vertex color is attribute DPSOFTRAST_ARRAY_COLOR, stepped per pixel in
// floating point and multiplied by buffer_z[x] at each pixel.
// When the blend mode is opaque and ALPHAKILL is off, results are written
// straight into the framebuffer row (dpsoftrast.fb_colorpixels[0]) for pixels
// whose span->pixelmask byte is non-zero; otherwise they go to a local buffer
// that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// The body is empty unless SSE_POSSIBLE is defined.
3372 static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3373 {
3374 #ifdef SSE_POSSIBLE
3375         unsigned char * RESTRICT pixelmask = span->pixelmask;
      // default destination: the framebuffer, offset so that pixel[x*4] addresses span pixel x
3376         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3377         int x, startx = span->startx, endx = span->endx;
3378         __m128i Color_Ambientm, Color_Diffusem;
3379         __m128 data, slope;
3380         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3381         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3382         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3383         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3384         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3385         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3386         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3387                 pixel = buffer_FragColorbgra8;
      // Color_Ambient as fixed point (256 == 1.0), lanes 0 and 2 swapped (presumably RGBA -> BGRA);
      // its fourth lane is replaced with the Alpha uniform scaled by 255
3388         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3389         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3390         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3391         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
      // Color_Diffuse as fixed point (4096 == 1.0) with the fourth lane cleared, so the
      // vertex color contributes nothing to alpha
3392         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3393         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3394         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3395         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3396         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3397         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
      // advance to the first pixel of the span and pre-scale by 4096
3398         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3399         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3400         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
      // data is stepped once per pixel by the loop increment, including on 'continue'
3401         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3402         {
3403                 __m128i color, mod, pix;
              // fast path: four consecutive pixels whose mask bytes are all exactly 1
3404                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3405                 {
3406                         __m128i pix2, mod2;
3407                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3408                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                      // mod holds pixels x and x+1, mod2 holds x+2 and x+3; data is stepped
                      // three times here and a fourth time by the loop increment
3409                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3410                         data = _mm_add_ps(data, slope);
3411                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3412                         data = _mm_add_ps(data, slope);
3413                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3414                         data = _mm_add_ps(data, slope);
3415                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
                      // (diffuse*4096 * color*4096) >> 16 is in the same 256 == 1.0 scale as the ambient term
3416                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3417                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3418                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3419                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3420                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3421                         x += 3;
3422                         continue;
3423                 }
              // masked-out pixels are left untouched
3424                 if (!pixelmask[x])
3425                         continue;
3426                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3427                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3428                 mod = _mm_packs_epi32(mod, mod);
3429                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3430                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3431         }
      // only needed when the results were not written directly to the framebuffer
3432         if (pixel == buffer_FragColorbgra8)
3433                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3434 #endif
3435 }
3436
3437
3438
// Vertex stage of the lightmap shader mode.  Transforms/projects the position
// array in place with the matrix at uniform slot
// DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1, transforms TEXCOORD0 in
// place with the matrix at uniform slot DPSOFTRAST_UNIFORM_TexMatrixM1, and
// passes TEXCOORD4 through DPSOFTRAST_Array_Load (the pixel stage samples the
// lightmap texture with TEXCOORD4).
3439 static void DPSOFTRAST_VertexShader_Lightmap(void)
3440 {
3441         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3442         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3443         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3444 }
3445
3446 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3447 {
3448 #ifdef SSE_POSSIBLE
3449         unsigned char * RESTRICT pixelmask = span->pixelmask;
3450         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3451         int x, startx = span->startx, endx = span->endx;
3452         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3453         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3454         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3455         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3456         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3457         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3458         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3459         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3460         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3461         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3462                 pixel = buffer_FragColorbgra8;
3463         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3464         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3465         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3466         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3467         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3468         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3469         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3470         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3471         {
3472                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3473                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3474                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3475                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3476                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3477                 for (x = startx;x < endx;x++)
3478                 {
3479                         __m128i color, lightmap, glow, pix;
3480                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3481                         {
3482                                 __m128i pix2;
3483                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3484                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3485                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3486                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3487                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3488                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3489                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3490                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3491                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3492                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3493                                 x += 3;
3494                                 continue;
3495                         }
3496                         if (!pixelmask[x])
3497                                 continue;
3498                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3499                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3500                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3501                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3502                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3503                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3504                 }
3505         }
3506         else
3507         {
3508                 for (x = startx;x < endx;x++)
3509                 {
3510                         __m128i color, lightmap, pix;
3511                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3512                         {
3513                                 __m128i pix2;
3514                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3515                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3516                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3517                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3518                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3519                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3520                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3521                                 x += 3;
3522                                 continue;
3523                         }
3524                         if (!pixelmask[x]) 
3525                                 continue;
3526                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3527                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3528                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3529                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3530                 }
3531         }
3532         if (pixel == buffer_FragColorbgra8)
3533                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3534 #endif
3535 }
3536
3537
3538 void DPSOFTRAST_VertexShader_LightDirection(void);
3539 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3540
// FakeLight vertex stage: does no work of its own, the LightDirection
// vertex shader produces everything that is needed.
static void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3545
3546 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3547 {
3548         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3549 }
3550
3551
3552
3553 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3554 {
3555         DPSOFTRAST_VertexShader_LightDirection();
3556         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3557 }
3558
3559 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3560 {
3561         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3562 }
3563
3564
3565
3566 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3567 {
3568         DPSOFTRAST_VertexShader_LightDirection();
3569         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3570 }
3571
3572 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3573 {
3574         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3575 }
3576
3577
3578
3579 void DPSOFTRAST_VertexShader_LightDirection(void)
3580 {
3581         int i;
3582         int numvertices = dpsoftrast.numvertices;
3583         float LightDir[4];
3584         float LightVector[4];
3585         float EyePosition[4];
3586         float EyeVectorModelSpace[4];
3587         float EyeVector[4];
3588         float position[4];
3589         float svector[4];
3590         float tvector[4];
3591         float normal[4];
3592         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3593         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3594         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3595         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3596         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3597         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3598         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3599         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3600         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3601         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3602         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3603         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3604         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3605         for (i = 0;i < numvertices;i++)
3606         {
3607                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3608                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3609                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3610                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3611                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3612                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3613                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3614                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3615                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3616                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3617                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3618                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3619                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3620                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3621                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3622                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3623                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3624                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3625                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3626                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3627                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3628                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3629                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3630                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3631                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3632                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3633                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3634                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3635                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3636         }
3637         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3638 }
3639
// Small scalar / 3-component vector helpers.
// All of these are macros that evaluate their arguments more than once, so
// arguments must not have side effects.  Every argument is parenthesized in
// the expansion, so expressions such as (p + 3) are safe to pass.
#define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
#define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
#define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
#define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
#define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Scales v in place to unit length; a zero-length vector is left untouched.
// The expansion declares a local named len, so the argument expression must
// not itself refer to a variable called len.
#define DPSOFTRAST_Vector3Normalize(v)\
do\
{\
	float len = sqrt(DPSOFTRAST_Vector3Dot((v),(v)));\
	if (len)\
	{\
		len = 1.0f / len;\
		(v)[0] *= len;\
		(v)[1] *= len;\
		(v)[2] *= len;\
	}\
}\
while(0)
3658
3659 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3660 {
3661         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3662         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3663         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3664         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3665         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3666         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3667         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3668         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3669         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3670         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3671         int x, startx = span->startx, endx = span->endx;
3672         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3673         float LightVectordata[4];
3674         float LightVectorslope[4];
3675         float EyeVectordata[4];
3676         float EyeVectorslope[4];
3677         float VectorSdata[4];
3678         float VectorSslope[4];
3679         float VectorTdata[4];
3680         float VectorTslope[4];
3681         float VectorRdata[4];
3682         float VectorRslope[4];
3683         float z;
3684         float diffusetex[4];
3685         float glosstex[4];
3686         float surfacenormal[4];
3687         float lightnormal[4];
3688         float lightnormal_modelspace[4];
3689         float eyenormal[4];
3690         float specularnormal[4];
3691         float diffuse;
3692         float specular;
3693         float SpecularPower;
3694         int d[4];
3695         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3696         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3697         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3698         Color_Glow[3] = 0.0f;
3699         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3700         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3701         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3702         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3703         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3704         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3705         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3706         Color_Pants[3] = 0.0f;
3707         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3708         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3709         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3710         Color_Shirt[3] = 0.0f;
3711         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3712         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3713         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3714         {
3715                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3716                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3717         }
3718         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3719         {
3720                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3721         }
3722         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3723         {
3724                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3725                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3726                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3727                 Color_Diffuse[3] = 0.0f;
3728                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3729                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3730                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3731                 LightColor[3] = 0.0f;
3732                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3733                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3734                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3735                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3736                 Color_Specular[3] = 0.0f;
3737                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3738                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3739                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3740
3741                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3742                 {
3743                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3744                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3745                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3746                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3747                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3748                 }
3749                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3750                 {
3751                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3752                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3753                 }
3754                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3755                 {
3756                         // nothing of this needed
3757                 }
3758                 else
3759                 {
3760                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3761                 }
3762
3763                 for (x = startx;x < endx;x++)
3764                 {
3765                         z = buffer_z[x];
3766                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3767                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3768                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3769                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3770                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3771                         {
3772                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3773                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3774                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3775                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3776                         }
3777                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3778                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3779                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3780                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3781                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3782                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3783                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3784                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3785
3786                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3787                         {
3788                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3789                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3790                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3791                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3792
3793                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3794                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3795                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3796                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3797
3798                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3799                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3800                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3801                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3802
3803                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3804                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3805                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3806                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3807
3808                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3809                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3810
3811                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3812                                 {
3813                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3814                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3815                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3816                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3817                                 }
3818                         }
3819                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3820                         {
3821                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3822                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3823                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3824                                 {
3825                                         float f = 1.0f / 256.0f;
3826                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3827                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3828                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3829                                 }
3830                         }
3831                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3832                         {
3833                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3834                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3835                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3836                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3837
3838                                 LightColor[0] = 1.0;
3839                                 LightColor[1] = 1.0;
3840                                 LightColor[2] = 1.0;
3841                         }
3842                         else
3843                         {
3844                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3845                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3846                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3847                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3848                         }
3849
3850                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3851
3852                         if(thread->shader_exactspecularmath)
3853                         {
3854                                 // reflect lightnormal at surfacenormal, take the negative of that
3855                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3856                                 float f;
3857                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3858                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3859                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3860                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3861
3862                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3863                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3864                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3865                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3866                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3867
3868                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3869                         }
3870                         else
3871                         {
3872                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3873                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3874                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3875                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3876
3877                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3878                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3879                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3880                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3881
3882                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3883                         }
3884                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
3885
3886                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3887                         {
3888                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3889                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3890                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3891                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3892                         }
3893                         else
3894                         {
3895                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3896                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3897                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3898                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3899                         }
3900
3901                         buffer_FragColorbgra8[x*4+0] = d[0];
3902                         buffer_FragColorbgra8[x*4+1] = d[1];
3903                         buffer_FragColorbgra8[x*4+2] = d[2];
3904                         buffer_FragColorbgra8[x*4+3] = d[3];
3905                 }
3906         }
3907         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3908         {
3909                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3910                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3911                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3912                 Color_Diffuse[3] = 0.0f;
3913                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3914                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3915                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3916                 LightColor[3] = 0.0f;
3917                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3918
3919                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3920                 {
3921                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3922                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3923                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3924                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3925                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3926                 }
3927                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3928                 {
3929                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3930                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3931                 }
3932                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3933                 {
3934                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3935                 }
3936                 else
3937                 {
3938                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3939                 }
3940
3941                 for (x = startx;x < endx;x++)
3942                 {
3943                         z = buffer_z[x];
3944                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3945                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3946                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3947                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3948                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3949                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3950                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3951                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3952
3953                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3954                         {
3955                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3956                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3957                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3958                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3959
3960                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3961                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3962                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3963                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3964
3965                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3966                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3967                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3968                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3969
3970                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3971                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3972                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3973                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3974
3975                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3976                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3977
3978                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3979                                 {
3980                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3981                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3982                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3983                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3984                                 }
3985                         }
3986                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3987                         {
3988                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3989                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3990                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3991                                 {
3992                                         float f = 1.0f / 256.0f;
3993                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3994                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3995                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3996                                 }
3997                         }
3998                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3999                         {
4000                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4001                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4002                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4003                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4004
4005                                 LightColor[0] = 1.0;
4006                                 LightColor[1] = 1.0;
4007                                 LightColor[2] = 1.0;
4008                         }
4009                         else
4010                         {
4011                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4012                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4013                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4014                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4015                         }
4016
4017                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4018                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4019                         {
4020                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4021                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4022                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4023                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4024                         }
4025                         else
4026                         {
4027                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4028                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4029                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4030                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4031                         }
4032                         buffer_FragColorbgra8[x*4+0] = d[0];
4033                         buffer_FragColorbgra8[x*4+1] = d[1];
4034                         buffer_FragColorbgra8[x*4+2] = d[2];
4035                         buffer_FragColorbgra8[x*4+3] = d[3];
4036                 }
4037         }
4038         else
4039         {
4040                 for (x = startx;x < endx;x++)
4041                 {
4042                         // z = buffer_z[x];
4043                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4044                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4045                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4046                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4047
4048                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4049                         {
4050                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4051                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4052                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4053                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4054                         }
4055                         else
4056                         {
4057                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4058                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4059                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4060                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4061                         }
4062                         buffer_FragColorbgra8[x*4+0] = d[0];
4063                         buffer_FragColorbgra8[x*4+1] = d[1];
4064                         buffer_FragColorbgra8[x*4+2] = d[2];
4065                         buffer_FragColorbgra8[x*4+3] = d[3];
4066                 }
4067         }
4068         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4069 }
4070
4071
4072
// Vertex stage for the light-source shader mode.
// For every vertex it builds a light vector and an eye vector, each expressed
// in the per-vertex basis given by TEXCOORD1 (svector), TEXCOORD2 (tvector)
// and TEXCOORD3 (normal), and stores them in the TEXCOORD1 / TEXCOORD2 output
// arrays. It then projects POSITION with the model-view-projection matrix and
// fills TEXCOORD3 using the ModelToLight matrix.
// DPSOFTRAST_PixelShader_LightSource reads these three arrays back as its
// LightVector, EyeVector and CubeVector attributes.
// Takes no arguments and returns nothing; all input and output goes through
// the global dpsoftrast state.
4073 static void DPSOFTRAST_VertexShader_LightSource(void)
4074 {
4075         int i;
4076         int numvertices = dpsoftrast.numvertices;
             // Only elements [0..2] of the vectors below are used in the math;
             // element [3] is read for the two positions but never consumed.
4077         float LightPosition[4];
4078         float LightVector[4];
4079         float LightVectorModelSpace[4];
4080         float EyePosition[4];
4081         float EyeVectorModelSpace[4];
4082         float EyeVector[4];
4083         float position[4];
4084         float svector[4];
4085         float tvector[4];
4086         float normal[4];
             // Fetch the light and eye positions from the uniform table (4 floats per uniform slot).
4087         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4088         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4089         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4090         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4091         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4092         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4093         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4094         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
             // Prepare the per-vertex arrays the loop below reads from post_array4f.
             // NOTE(review): DPSOFTRAST_Array_Load / DPSOFTRAST_Array_Transform are defined
             // outside this chunk; presumably they fill post_array4f[out] from the matching
             // input array (Transform additionally applying the given matrix) -- confirm.
             // TEXCOORD0 goes through the TexMatrix uniform; TEXCOORD4 is loaded but not
             // referenced again in this function.
4095         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4096         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4097         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4098         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4099         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4100         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4101         for (i = 0;i < numvertices;i++)
4102         {
                     // Snapshot position and the svector/tvector/normal basis first:
                     // the TEXCOORD1 and TEXCOORD2 slots they come from are overwritten
                     // further down in this same iteration.
4103                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4104                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4105                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4106                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4107                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4108                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4109                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4110                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4111                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4112                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4113                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4114                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
                     // Vector from the vertex to the light, then its components along
                     // svector, tvector and normal (three dot products).
4115                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4116                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4117                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4118                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4119                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4120                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
                     // TEXCOORD1 output now carries the light vector (w forced to 0).
4121                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4122                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4123                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4124                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
                     // Same construction for the vector from the vertex to the eye.
4125                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4126                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4127                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4128                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4129                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4130                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
                     // TEXCOORD2 output now carries the eye vector (w forced to 0).
4131                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4132                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4133                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4134                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4135         }
             // These two calls come after the loop, which has already read POSITION and the
             // TEXCOORD3 normal from post_array4f.
             // NOTE(review): the meaning of the -1 input-array argument is not visible in
             // this chunk -- confirm against DPSOFTRAST_Array_TransformProject.
4136         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
             // TEXCOORD3 output is produced from the POSITION array with the ModelToLight matrix;
             // the light-source pixel shader reads TEXCOORD3 as its CubeVector.
4137         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4138 }
4139
4140 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4141 {
4142 #ifdef SSE_POSSIBLE
4143         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4144         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4145         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4146         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4147         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4148         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4149         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4150         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4151         int x, startx = span->startx, endx = span->endx;
4152         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], /*Color_Glow[4],*/ Color_Pants[4], Color_Shirt[4], LightColor[4];
4153         float CubeVectordata[4];
4154         float CubeVectorslope[4];
4155         float LightVectordata[4];
4156         float LightVectorslope[4];
4157         float EyeVectordata[4];
4158         float EyeVectorslope[4];
4159         float z;
4160         float diffusetex[4];
4161         float glosstex[4];
4162         float surfacenormal[4];
4163         float lightnormal[4];
4164         float eyenormal[4];
4165         float specularnormal[4];
4166         float diffuse;
4167         float specular;
4168         float SpecularPower;
4169         float CubeVector[4];
4170         float attenuation;
4171         int d[4];
4172 #if 0
4173         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4174         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4175         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4176         Color_Glow[3] = 0.0f;
4177 #endif
4178         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4179         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4180         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4181         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4182         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4183         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4184         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4185         Color_Diffuse[3] = 0.0f;
4186         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4187         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4188         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4189         Color_Specular[3] = 0.0f;
4190         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4191         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4192         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4193         Color_Pants[3] = 0.0f;
4194         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4195         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4196         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4197         Color_Shirt[3] = 0.0f;
4198         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4199         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4200         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4201         LightColor[3] = 0.0f;
4202         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4203         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4204         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4205         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4206         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4207         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4208         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4209         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4210         {
4211                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4212                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4213         }
4214         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4215                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4216         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4217         {
4218                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4219                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4220                 for (x = startx;x < endx;x++)
4221                 {
4222                         z = buffer_z[x];
4223                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4224                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4225                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4226                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4227                         if (attenuation < 0.01f)
4228                                 continue;
4229                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4230                         {
4231                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4232                                 if (attenuation < 0.01f)
4233                                         continue;
4234                         }
4235
4236                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4237                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4238                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4239                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4240                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4241                         {
4242                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4243                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4244                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4245                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4246                         }
4247                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4248                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4249                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4250                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4251                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4252                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4253                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4254                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4255
4256                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4257                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4258                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4259                         DPSOFTRAST_Vector3Normalize(lightnormal);
4260
4261                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4262
4263                         if(thread->shader_exactspecularmath)
4264                         {
4265                                 // reflect lightnormal at surfacenormal, take the negative of that
4266                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4267                                 float f;
4268                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4269                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4270                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4271                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4272
4273                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4274                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4275                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4276                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4277                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4278
4279                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4280                         }
4281                         else
4282                         {
4283                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4284                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4285                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4286                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4287
4288                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4289                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4290                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4291                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4292
4293                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4294                         }
4295                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
4296
4297                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4298                         {
4299                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4300                                 attenuation *= (1.0f / 255.0f);
4301                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4302                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4303                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4304                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4305                         }
4306                         else
4307                         {
4308                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4309                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4310                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4311                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4312                         }
4313                         buffer_FragColorbgra8[x*4+0] = d[0];
4314                         buffer_FragColorbgra8[x*4+1] = d[1];
4315                         buffer_FragColorbgra8[x*4+2] = d[2];
4316                         buffer_FragColorbgra8[x*4+3] = d[3];
4317                 }
4318         }
4319         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4320         {
4321                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4322                 for (x = startx;x < endx;x++)
4323                 {
4324                         z = buffer_z[x];
4325                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4326                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4327                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4328                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4329                         if (attenuation < 0.01f)
4330                                 continue;
4331                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4332                         {
4333                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4334                                 if (attenuation < 0.01f)
4335                                         continue;
4336                         }
4337
4338                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4339                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4340                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4341                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4342                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4343                         {
4344                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4345                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4346                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4347                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4348                         }
4349                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4350                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4351                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4352                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4353
4354                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4355                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4356                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4357                         DPSOFTRAST_Vector3Normalize(lightnormal);
4358
4359                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4360                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4361                         {
4362                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4363                                 attenuation *= (1.0f / 255.0f);
4364                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4365                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4366                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4367                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4368                         }
4369                         else
4370                         {
4371                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4372                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4373                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4374                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4375                         }
4376                         buffer_FragColorbgra8[x*4+0] = d[0];
4377                         buffer_FragColorbgra8[x*4+1] = d[1];
4378                         buffer_FragColorbgra8[x*4+2] = d[2];
4379                         buffer_FragColorbgra8[x*4+3] = d[3];
4380                 }
4381         }
4382         else
4383         {
4384                 for (x = startx;x < endx;x++)
4385                 {
4386                         z = buffer_z[x];
4387                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4388                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4389                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4390                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4391                         if (attenuation < 0.01f)
4392                                 continue;
4393                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4394                         {
4395                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4396                                 if (attenuation < 0.01f)
4397                                         continue;
4398                         }
4399
4400                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4401                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4402                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4403                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4404                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4405                         {
4406                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4407                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4408                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4409                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4410                         }
4411                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4412                         {
4413                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4414                                 attenuation *= (1.0f / 255.0f);
4415                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4416                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4417                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4418                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4419                         }
4420                         else
4421                         {
4422                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4423                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4424                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4425                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4426                         }
4427                         buffer_FragColorbgra8[x*4+0] = d[0];
4428                         buffer_FragColorbgra8[x*4+1] = d[1];
4429                         buffer_FragColorbgra8[x*4+2] = d[2];
4430                         buffer_FragColorbgra8[x*4+3] = d[3];
4431                 }
4432         }
4433         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4434 #endif
4435 }
4436
4437
4438
4439 static void DPSOFTRAST_VertexShader_Refraction(void)
4440 {
4441         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4442         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4443         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4444 }
4445
4446 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4447 {
4448         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4449         //float z;
4450         int x, startx = span->startx, endx = span->endx;
4451
4452         // texture reads
4453         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4454         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4455
4456         // varyings
4457         float ModelViewProjectionPositiondata[4];
4458         float ModelViewProjectionPositionslope[4];
4459
4460         // uniforms
4461         float ScreenScaleRefractReflect[2];
4462         float ScreenCenterRefractReflect[2];
4463         float DistortScaleRefractReflect[2];
4464         float RefractColor[4];
4465
4466         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4467         if(!texture) return;
4468
4469         // read textures
4470         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4471         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4472
4473         // read varyings
4474         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4475
4476         // read uniforms
4477         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4478         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4479         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4480         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4481         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4482         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4483         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4484         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4485         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4486         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4487
4488         // do stuff
4489         for (x = startx;x < endx;x++)
4490         {
4491                 float SafeScreenTexCoord[2];
4492                 float ScreenTexCoord[2];
4493                 float v[3];
4494                 float iw;
4495                 unsigned char c[4];
4496
4497                 //z = buffer_z[x];
4498
4499                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4500                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4501
4502                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4503                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4504                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4505
4506                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4507                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4508                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4509                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4510                 DPSOFTRAST_Vector3Normalize(v);
4511                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4512                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4513
4514                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4515                 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4516
4517                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4518                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4519                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4520                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4521         }
4522
4523         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4524 }
4525
4526
4527
4528 static void DPSOFTRAST_VertexShader_Water(void)
4529 {
4530         int i;
4531         int numvertices = dpsoftrast.numvertices;
4532         float EyePosition[4];
4533         float EyeVectorModelSpace[4];
4534         float EyeVector[4];
4535         float position[4];
4536         float svector[4];
4537         float tvector[4];
4538         float normal[4];
4539         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4540         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4541         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4542         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4543         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4544         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4545         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4546         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4547         for (i = 0;i < numvertices;i++)
4548         {
4549                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4550                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4551                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4552                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4553                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4554                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4555                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4556                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4557                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4558                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4559                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4560                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4561                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4562                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4563                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4564                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4565                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4566                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4567                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4568                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4569                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4570                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4571         }
4572         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4573         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4574         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4575 }
4576
4577
4578 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4579 {
4580         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4581         // float z;
4582         int x, startx = span->startx, endx = span->endx;
4583
4584         // texture reads
4585         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4586         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4587
4588         // varyings
4589         float ModelViewProjectionPositiondata[4];
4590         float ModelViewProjectionPositionslope[4];
4591         float EyeVectordata[4];
4592         float EyeVectorslope[4];
4593
4594         // uniforms
4595         float ScreenScaleRefractReflect[4];
4596         float ScreenCenterRefractReflect[4];
4597         float DistortScaleRefractReflect[4];
4598         float RefractColor[4];
4599         float ReflectColor[4];
4600         float ReflectFactor;
4601         float ReflectOffset;
4602
4603         DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4604         DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4605         if(!texture_refraction || !texture_reflection) return;
4606
4607         // read textures
4608         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4609         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4610
4611         // read varyings
4612         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4613         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4614
4615         // read uniforms
4616         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4617         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4618         ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4619         ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4620         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4621         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4622         ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4623         ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4624         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4625         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4626         DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4627         DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4628         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4629         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4630         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4631         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4632         ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4633         ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4634         ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4635         ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4636         ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4637         ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4638
4639         // do stuff
4640         for (x = startx;x < endx;x++)
4641         {
4642                 float SafeScreenTexCoord[4];
4643                 float ScreenTexCoord[4];
4644                 float v[3];
4645                 float iw;
4646                 unsigned char c1[4];
4647                 unsigned char c2[4];
4648                 float Fresnel;
4649
4650                 // z = buffer_z[x];
4651
4652                 // "    vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4653                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4654
4655                 // "    vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4656                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4657                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4658                 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4659                 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4660
4661                 // "    vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4662                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4663                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4664                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4665                 DPSOFTRAST_Vector3Normalize(v);
4666                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4667                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4668                 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4669                 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4670
4671                 // "    float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4672                 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4673                 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4674                 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4675                 DPSOFTRAST_Vector3Normalize(v);
4676                 Fresnel = 1.0f - v[2];
4677                 Fresnel = min(1.0f, Fresnel);
4678                 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4679
4680                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4681                 // "    dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4682                 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4683                 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4684
4685                 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4686                 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4687                 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4688                 buffer_FragColorbgra8[x*4+3] = min((    RefractColor[3] *  (1.0f - Fresnel) +          ReflectColor[3]  * Fresnel) * 256, 255);
4689         }
4690
4691         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4692 }
4693
4694
4695
4696 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4697 {
4698         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4699 }
4700
4701 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4702 {
4703         // TODO: IMPLEMENT
4704         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4705         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4706         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4707         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4708         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4709 }
4710
4711
4712
4713 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4714 {
4715         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4716 }
4717
4718 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4719 {
4720         // TODO: IMPLEMENT
4721         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4722         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4723         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4724         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4725         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4726 }
4727
4728
4729
4730 typedef struct DPSOFTRAST_ShaderModeInfo_s
4731 {
4732         int lodarrayindex;
4733         void (*Vertex)(void);
4734         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4735         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4736         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4737 }
4738 DPSOFTRAST_ShaderModeInfo;
4739
4740 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4741 {
4742         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4743         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4744         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4745         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4746         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4747         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4748         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4749         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4750         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4751         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4752         {2, DPSOFTRAST_VertexShader_VertexColor,                        DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4753         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4754         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4755         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4756         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4757         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4758         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4759 };
4760
4761 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4762 {
4763         int x;
4764         int startx;
4765         int endx;
4766         unsigned int *depthpixel;
4767         int depth;
4768         int depthslope;
4769         unsigned int d;
4770         unsigned char *pixelmask;
4771         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4772         startx = span->startx;
4773         endx = span->endx;
4774         depth = span->depthbase;
4775         depthslope = span->depthslope;
4776         pixelmask = thread->pixelmaskarray;
4777         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4778         {
4779                 switch(thread->fb_depthfunc)
4780                 {
4781                 default:
4782                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4783                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4784                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4785                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4786                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4787                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4788                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4789                 }
4790                 while (startx < endx && !pixelmask[startx])
4791                         startx++;
4792                 while (endx > startx && !pixelmask[endx-1])
4793                         endx--;
4794         }
4795         else
4796         {
4797                 // no depth testing means we're just dealing with color...
4798                 memset(pixelmask + startx, 1, endx - startx);
4799         }
4800         span->pixelmask = pixelmask;
4801         span->startx = startx;
4802         span->endx = endx;
4803 }
4804
4805 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4806 {
4807         int x, d, depth, depthslope, startx, endx;
4808         const unsigned char *pixelmask;
4809         unsigned int *depthpixel;
4810         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4811         {
4812                 depth = span->depthbase;
4813                 depthslope = span->depthslope;
4814                 pixelmask = span->pixelmask;
4815                 startx = span->startx;
4816                 endx = span->endx;
4817                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4818                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4819                         if (pixelmask[x])
4820                                 depthpixel[x] = d;
4821         }
4822 }
4823
4824 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4825 {
4826         int i;
4827         DPSOFTRAST_State_Triangle *triangle;
4828         DPSOFTRAST_State_Span *span;
4829         for (i = 0; i < thread->numspans; i++)
4830         {
4831                 span = &thread->spans[i];
4832                 triangle = &thread->triangles[span->triangle];
4833                 DPSOFTRAST_Draw_DepthTest(thread, span);
4834                 if (span->startx >= span->endx)
4835                         continue;
4836                 // run pixel shader if appropriate
4837                 // do this before running depthmask code, to allow the pixelshader
4838                 // to clear pixelmask values for alpha testing
4839                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4840                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4841                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4842         }
4843         thread->numspans = 0;
4844 }
4845
// Declares the "Draw" command through the DEFCOMMAND macro (defined elsewhere
// in this file; 22 is presumably the command's opcode - confirm against the
// macro).  The member list given here is what DPSOFTRAST_Interpret_Draw reads
// through its DPSOFTRAST_Command_Draw pointer:
//   starty/endy             - row range tested against the thread's bands
//   refcount                - atomically decremented by a thread that skips the
//                             command; when it reaches zero that thread may
//                             free 'arrays'
//   clipped                 - nonzero when triangles must be tested against the near plane
//   firstvertex/numvertices - vertex range; element indices are rebased by firstvertex
//   numtriangles            - number of triangles to set up
//   arrays                  - screen coordinates followed by per-vertex arrays, 4 floats per vertex
//   element3i/element3s     - optional int / unsigned short index lists (element3s is checked first)
4846 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;)
4847
4848 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4849 {
4850 #ifdef SSE_POSSIBLE
4851         int cullface = thread->cullface;
4852         int minx, maxx, miny, maxy;
4853         int miny1, maxy1, miny2, maxy2;
4854         __m128i fbmin, fbmax;
4855         __m128 viewportcenter, viewportscale;
4856         int firstvertex = command->firstvertex;
4857         int numvertices = command->numvertices;
4858         int numtriangles = command->numtriangles;
4859         const int *element3i = command->element3i;
4860         const unsigned short *element3s = command->element3s;
4861         int clipped = command->clipped;
4862         int i;
4863         int j;
4864         int k;
4865         int y;
4866         int e[3];
4867         __m128i screeny;
4868         int starty, endy, bandy;
4869         int numpoints;
4870         int clipcase;
4871         float clipdist[4];
4872         float clip0origin, clip0slope;
4873         int clip0dir;
4874         __m128 triangleedge1, triangleedge2, trianglenormal;
4875         __m128 clipfrac[3];
4876         __m128 screen[4];
4877         DPSOFTRAST_State_Triangle *triangle;
4878         DPSOFTRAST_Texture *texture;
4879         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4880         miny = thread->fb_scissor[1];
4881         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4882         miny1 = bound(miny, thread->miny1, maxy);
4883         maxy1 = bound(miny, thread->maxy1, maxy);
4884         miny2 = bound(miny, thread->miny2, maxy);
4885         maxy2 = bound(miny, thread->maxy2, maxy);
4886         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4887         {
4888                 if (!ATOMIC_DECREMENT(command->refcount))
4889                 {
4890                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4891                                 MM_FREE(command->arrays);
4892                 }
4893                 return;
4894         }
4895         minx = thread->fb_scissor[0];
4896         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4897         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4898         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4899         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4900         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4901         screen[3] = _mm_setzero_ps();
4902         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4903         for (i = 0;i < numtriangles;i++)
4904         {
4905                 const float *screencoord4f = command->arrays;
4906                 const float *arrays = screencoord4f + numvertices*4;
4907
4908                 // generate the 3 edges of this triangle
4909                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4910                 if (element3s)
4911                 {
4912                         e[0] = element3s[i*3+0] - firstvertex;
4913                         e[1] = element3s[i*3+1] - firstvertex;
4914                         e[2] = element3s[i*3+2] - firstvertex;
4915                 }
4916                 else if (element3i)
4917                 {
4918                         e[0] = element3i[i*3+0] - firstvertex;
4919                         e[1] = element3i[i*3+1] - firstvertex;
4920                         e[2] = element3i[i*3+2] - firstvertex;
4921                 }
4922                 else
4923                 {
4924                         e[0] = i*3+0;
4925                         e[1] = i*3+1;
4926                         e[2] = i*3+2;
4927                 }
4928
4929 #define SKIPBACKFACE \
4930                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4931                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4932                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4933                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4934                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4935                 switch(cullface) \
4936                 { \
4937                 case GL_BACK: \
4938                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4939                                 continue; \
4940                         break; \
4941                 case GL_FRONT: \
4942                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4943                                 continue; \
4944                         break; \
4945                 }
4946
4947 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4948                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4949                         { \
4950                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4951                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4952                         }
4953 #define CLIPPEDVERTEXCOPY(k,p1) \
4954                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4955
4956 #define GENATTRIBCOPY(attrib, p1) \
4957                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4958 #define GENATTRIBLERP(attrib, p1, p2) \
4959                 { \
4960                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4961                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4962                 }
4963 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4964                 switch(clipcase) \
4965                 { \
4966                 default: \
4967                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4968                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4969                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4970                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4971                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4972                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4973                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4974                 }
4975
4976                 if (! clipped)
4977                         goto notclipped;
4978
4979                 // calculate distance from nearplane
4980                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4981                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4982                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4983                 if (clipdist[0] >= 0.0f)
4984                 {
4985                         if (clipdist[1] >= 0.0f)
4986                         {
4987                                 if (clipdist[2] >= 0.0f)
4988                                 {
4989                                 notclipped:
4990                                         // triangle is entirely in front of nearplane
4991                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4992                                         SKIPBACKFACE;
4993                                         numpoints = 3;
4994                                         clipcase = 0;
4995                                 }
4996                                 else
4997                                 {
4998                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4999                                         SKIPBACKFACE;
5000                                         numpoints = 4;
5001                                         clipcase = 1;
5002                                 }
5003                         }
5004                         else
5005                         {
5006                                 if (clipdist[2] >= 0.0f)
5007                                 {
5008                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5009                                         SKIPBACKFACE;
5010                                         numpoints = 4;
5011                                         clipcase = 2;
5012                                 }
5013                                 else
5014                                 {
5015                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5016                                         SKIPBACKFACE;
5017                                         numpoints = 3;
5018                                         clipcase = 3;
5019                                 }
5020                         }
5021                 }
5022                 else if (clipdist[1] >= 0.0f)
5023                 {
5024                         if (clipdist[2] >= 0.0f)
5025                         {
5026                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5027                                 SKIPBACKFACE;
5028                                 numpoints = 4;
5029                                 clipcase = 4;
5030                         }
5031                         else
5032                         {
5033                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5034                                 SKIPBACKFACE;
5035                                 numpoints = 3;
5036                                 clipcase = 5;
5037                         }
5038                 }
5039                 else if (clipdist[2] >= 0.0f)
5040                 {
5041                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5042                         SKIPBACKFACE;
5043                         numpoints = 3;
5044                         clipcase = 6;
5045                 }
5046                 else continue; // triangle is entirely behind nearplane
5047
5048                 {
5049                         // calculate integer y coords for triangle points
5050                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5051                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5052                                         screenmin = _mm_min_epi16(screeni, screenir),
5053                                         screenmax = _mm_max_epi16(screeni, screenir);
5054                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5055                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5056                         screenmin = _mm_max_epi16(screenmin, fbmin);
5057                         screenmax = _mm_min_epi16(screenmax, fbmax);
5058                         // skip offscreen triangles
5059                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5060                                 continue;
5061                         starty = _mm_extract_epi16(screenmin, 1);
5062                         endy = _mm_extract_epi16(screenmax, 1)+1;
5063                         if (starty >= maxy1 && endy <= miny2)
5064                                 continue;
5065                         screeny = _mm_srai_epi32(screeni, 16);
5066                 }
5067
5068                 triangle = &thread->triangles[thread->numtriangles];
5069
5070                 // calculate attribute plans for triangle data...
5071                 // okay, this triangle is going to produce spans, we'd better project
5072                 // the interpolants now (this is what gives perspective texturing),
5073                 // this consists of simply multiplying all arrays by the W coord
5074                 // (which is basically 1/Z), which will be undone per-pixel
5075                 // (multiplying by Z again) to get the perspective-correct array
5076                 // values
5077                 {
5078                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5079                         __m128 mipedgescale, mipdensity;
5080                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5081                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5082                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5083                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5084                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5085                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5086                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5087                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5088                         attribedge1 = _mm_sub_ss(w0, w1);
5089                         attribedge2 = _mm_sub_ss(w2, w1);
5090                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5091                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5092                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5093                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5094                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5095                         _mm_store_ss(&triangle->w[0], attribxslope);
5096                         _mm_store_ss(&triangle->w[1], attribyslope);
5097                         _mm_store_ss(&triangle->w[2], attriborigin);
5098                         
5099                         clip0origin = 0;
5100                         clip0slope = 0;
5101                         clip0dir = 0;
5102                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5103                         {
5104                                 float cliporigin, clipxslope, clipyslope;
5105                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5106                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5107                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5108                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5109                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5110                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5111                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5112                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5113                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5114                                 if(clipxslope != 0)
5115                                 {
5116                                         clip0origin = -cliporigin/clipxslope;
5117                                         clip0slope = -clipyslope/clipxslope;
5118                                         clip0dir = clipxslope > 0 ? 1 : -1;
5119                                 }
5120                                 else if(clipyslope > 0)
5121                                 {
5122                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5123                                         clip0slope = dpsoftrast.fb_width;
5124                                         clip0dir = -1;
5125                                 }
5126                                 else if(clipyslope < 0)
5127                                 {
5128                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5129                                         clip0slope = -dpsoftrast.fb_width;
5130                                         clip0dir = -1;
5131                                 }
5132                                 else if(clip0origin < 0) continue;
5133                         }
5134
5135                         mipedgescale = _mm_setzero_ps();
5136                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5137                         {
5138                                 __m128 attrib0, attrib1, attrib2;
5139                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5140                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5141                                         break;
5142                                 arrays += numvertices*4;
5143                                 GENATTRIBS(attrib0, attrib1, attrib2);
5144                                 attriborigin = _mm_mul_ps(attrib1, w1);
5145                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5146                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5147                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5148                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5149                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5150                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5151                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5152                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5153                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5154                                 {
5155                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5156                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5157                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5158                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5159                                 }
5160                         }
5161
5162                         memset(triangle->mip, 0, sizeof(triangle->mip));
5163                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5164                         {
5165                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5166                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5167                                         break;
5168                                 texture = thread->texbound[texunit];
5169                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5170                                 {
5171                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5172                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5173                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5174                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5175                                         // this will be multiplied in the texturing routine by the texture resolution
5176                                         y = _mm_cvtss_si32(mipdensity);
5177                                         if (y > 0)
5178                                         {
5179                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5180                                                 if (y > texture->mipmaps - 1)
5181                                                         y = texture->mipmaps - 1;
5182                                                 triangle->mip[texunit] = y;
5183                                         }
5184                                 }
5185                         }
5186                 }
5187         
5188                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5189                 for (; y < bandy;)
5190                 {
5191                         __m128 xcoords, xslope;
5192                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5193                         int yccmask = _mm_movemask_epi8(ycc);
5194                         int edge0p, edge0n, edge1p, edge1n;
5195                         int nexty;
5196                         float w, wslope;
5197                         float clip0;
5198                         if (numpoints == 4)
5199                         {
5200                                 switch(yccmask)
5201                                 {
5202                                 default:
5203                                 case 0xFFFF: /*0000*/ y = endy; continue;
5204                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5205                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5206                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5207                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5208                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5209                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5210                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5211                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5212                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5213                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5214                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5215                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5216                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5217                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5218                                 case 0x0000: /*1111*/ y++; continue;
5219                                 }
5220                         }
5221                         else
5222                         {
5223                                 switch(yccmask)
5224                                 {
5225                                 default:
5226                                 case 0xFFFF: /*000*/ y = endy; continue;
5227                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5228                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5229                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5230                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5231                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5232                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5233                                 case 0x0000: /*111*/ y++; continue;
5234                                 }
5235                         }
5236                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5237                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5238                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5239                         nexty = _mm_extract_epi16(ycc, 0);
5240                         if (nexty >= bandy) nexty = bandy-1;
5241                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5242                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5243                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5244                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5245                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5246                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5247                         {
5248                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5249                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5250                         }
5251                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5252                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5253                         {
5254                                 int startx, endx, offset;
5255                                 startx = _mm_cvtss_si32(xcoords);
5256                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5257                                 if (startx < minx) startx = minx;
5258                                 if (endx > maxx) endx = maxx;
5259                                 if (startx >= endx) continue;
5260
5261                                 if (clip0dir)
5262                                 {
5263                                         if (clip0dir > 0)
5264                                         {
5265                                                 if (startx < clip0) 
5266                                                 {
5267                                                         if(endx <= clip0) continue;
5268                                                         startx = (int)clip0;
5269                                                 }
5270                                         }
5271                                         else if (endx > clip0) 
5272                                         {
5273                                                 if(startx >= clip0) continue;
5274                                                 endx = (int)clip0;
5275                                         }
5276                                 }
5277                                                 
5278                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5279                                 {
5280                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5281                                         span->triangle = thread->numtriangles;
5282                                         span->x = offset;
5283                                         span->y = y;
5284                                         span->startx = 0;
5285                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5286                                         if (span->startx >= span->endx)
5287                                                 continue;
5288                                         wslope = triangle->w[0];
5289                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5290                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5291                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5292                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5293                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5294                                 }
5295                         }
5296                 }
5297
5298                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5299                 {
5300                         DPSOFTRAST_Draw_ProcessSpans(thread);
5301                         thread->numtriangles = 0;
5302                 }
5303         }
5304
5305         if (!ATOMIC_DECREMENT(command->refcount))
5306         {
5307                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5308                         MM_FREE(command->arrays);
5309         }
5310
5311         if (thread->numspans > 0 || thread->numtriangles > 0)
5312         {
5313                 DPSOFTRAST_Draw_ProcessSpans(thread);
5314                 thread->numtriangles = 0;
5315         }
5316 #endif
5317 }
5318
5319 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5320 {
5321         int i;
5322         int j;
5323         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5324         int datasize = 2*numvertices*sizeof(float[4]);
5325         DPSOFTRAST_Command_Draw *command;
5326         unsigned char *data;
5327         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5328         {
5329                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5330                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5331                         break;
5332                 datasize += numvertices*sizeof(float[4]);
5333         }
5334         if (element3s)
5335                 datasize += numtriangles*sizeof(unsigned short[3]);
5336         else if (element3i)
5337                 datasize += numtriangles*sizeof(int[3]);
5338         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5339         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5340         {
5341                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5342                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5343         }
5344         else
5345         {
5346                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5347                 data = (unsigned char *)command + commandsize;
5348         }
5349         command->firstvertex = firstvertex;
5350         command->numvertices = numvertices;
5351         command->numtriangles = numtriangles;
5352         command->arrays = (float *)data;
5353         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5354         dpsoftrast.firstvertex = firstvertex;
5355         dpsoftrast.numvertices = numvertices;
5356         dpsoftrast.screencoord4f = (float *)data;
5357         data += numvertices*sizeof(float[4]);
5358         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5359         data += numvertices*sizeof(float[4]);
5360         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5361         {
5362                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5363                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5364                         break;
5365                 dpsoftrast.post_array4f[j] = (float *)data;
5366                 data += numvertices*sizeof(float[4]);
5367         }
5368         command->element3i = NULL;
5369         command->element3s = NULL;
5370         if (element3s)
5371         {
5372                 command->element3s = (unsigned short *)data;
5373                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5374         }
5375         else if (element3i)
5376         {
5377                 command->element3i = (int *)data;
5378                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5379         }
5380         return command;
5381 }
5382
5383 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5384 {
5385         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5386         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5387         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5388         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5389         if (command->starty >= command->endy)
5390         {
5391                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5392                         MM_FREE(command->arrays);
5393                 DPSOFTRAST_UndoCommand(command->commandsize);
5394                 return;
5395         }
5396         command->clipped = dpsoftrast.drawclipped;
5397         command->refcount = dpsoftrast.numthreads;
5398
5399         if (dpsoftrast.usethreads)
5400         {
5401                 int i;
5402                 DPSOFTRAST_Draw_SyncCommands();
5403                 for (i = 0; i < dpsoftrast.numthreads; i++)
5404                 {
5405                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5406                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5407                                 Thread_CondSignal(thread->drawcond);
5408                 }
5409         }
5410         else
5411         {
5412                 DPSOFTRAST_Draw_FlushThreads();
5413         }
5414 }
5415
5416 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
5417 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5418 {
5419         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5420 }
5421 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5422 {
5423         DPSOFTRAST_Command_SetRenderTargets *command;
5424         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5425                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5426                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5427                 DPSOFTRAST_Flush();
5428         dpsoftrast.fb_width = width;
5429         dpsoftrast.fb_height = height;
5430         dpsoftrast.fb_depthpixels = depthpixels;
5431         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5432         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5433         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5434         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5435         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5436         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5437         command->width = width;
5438         command->height = height;
5439 }
5440  
5441 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5442 {
5443         int commandoffset = thread->commandoffset;
5444         while (commandoffset != endoffset)
5445         {
5446                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5447                 switch (command->opcode)
5448                 {
5449 #define INTERPCOMMAND(name) \
5450                 case DPSOFTRAST_OPCODE_##name : \
5451                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5452                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5453                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5454                                 commandoffset = 0; \
5455                         break;
5456                 INTERPCOMMAND(Viewport)
5457                 INTERPCOMMAND(ClearColor)
5458                 INTERPCOMMAND(ClearDepth)
5459                 INTERPCOMMAND(ColorMask)
5460                 INTERPCOMMAND(DepthTest)
5461                 INTERPCOMMAND(ScissorTest)
5462                 INTERPCOMMAND(Scissor)
5463                 INTERPCOMMAND(BlendFunc)
5464                 INTERPCOMMAND(BlendSubtract)
5465                 INTERPCOMMAND(DepthMask)
5466                 INTERPCOMMAND(DepthFunc)
5467                 INTERPCOMMAND(DepthRange)
5468                 INTERPCOMMAND(PolygonOffset)
5469                 INTERPCOMMAND(CullFace)
5470                 INTERPCOMMAND(SetTexture)
5471                 INTERPCOMMAND(SetShader)
5472                 INTERPCOMMAND(Uniform4f)
5473                 INTERPCOMMAND(UniformMatrix4f)
5474                 INTERPCOMMAND(Uniform1i)
5475                 INTERPCOMMAND(SetRenderTargets)
5476                 INTERPCOMMAND(ClipPlane)
5477
5478                 case DPSOFTRAST_OPCODE_Draw:
5479                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5480                         commandoffset += command->commandsize;
5481                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5482                                 commandoffset = 0;
5483                         thread->commandoffset = commandoffset;
5484                         break;
5485
5486                 case DPSOFTRAST_OPCODE_Reset:
5487                         commandoffset = 0;
5488                         break;
5489                 }
5490         }
5491         thread->commandoffset = commandoffset;
5492 }
5493
5494 static int DPSOFTRAST_Draw_Thread(void *data)
5495 {
5496         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5497         while(thread->index >= 0)
5498         {
5499                 if (thread->commandoffset != dpsoftrast.drawcommand)
5500                 {
5501                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5502                 }
5503                 else 
5504                 {
5505                         Thread_LockMutex(thread->drawmutex);
5506                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5507                         {
5508                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5509                                 thread->starving = true;
5510                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5511                                 thread->starving = false;
5512                         }
5513                         Thread_UnlockMutex(thread->drawmutex);
5514                 }
5515         }   
5516         return 0;
5517 }
5518
5519 static void DPSOFTRAST_Draw_FlushThreads(void)
5520 {
5521         DPSOFTRAST_State_Thread *thread;
5522         int i;
5523         DPSOFTRAST_Draw_SyncCommands();
5524         if (dpsoftrast.usethreads) 
5525         {
5526                 for (i = 0; i < dpsoftrast.numthreads; i++)
5527                 {
5528                         thread = &dpsoftrast.threads[i];
5529                         if (thread->commandoffset != dpsoftrast.drawcommand)
5530                         {
5531                                 Thread_LockMutex(thread->drawmutex);
5532                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5533                                         Thread_CondSignal(thread->drawcond);
5534                                 Thread_UnlockMutex(thread->drawmutex);
5535                         }
5536                 }
5537                 for (i = 0; i < dpsoftrast.numthreads; i++)
5538                 {
5539                         thread = &dpsoftrast.threads[i];
5540                         if (thread->commandoffset != dpsoftrast.drawcommand)
5541                         {
5542                                 Thread_LockMutex(thread->drawmutex);
5543                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5544                                 {
5545                                         thread->waiting = true;
5546                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5547                                         thread->waiting = false;
5548                                 }
5549                                 Thread_UnlockMutex(thread->drawmutex);
5550                         }
5551                 }
5552         }
5553         else
5554         {
5555                 for (i = 0; i < dpsoftrast.numthreads; i++)
5556                 {
5557                         thread = &dpsoftrast.threads[i];
5558                         if (thread->commandoffset != dpsoftrast.drawcommand)
5559                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5560                 }
5561         }
5562         dpsoftrast.commandpool.usedcommands = 0;
5563 }
5564
// Public flush: forwards to DPSOFTRAST_Draw_FlushThreads, which processes all
// queued commands and empties the command pool.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5569
// Public finish: currently identical to DPSOFTRAST_Flush, which it calls.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5574
5575 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5576 {
5577         int i;
5578         union
5579         {
5580                 int i;
5581                 unsigned char b[4];
5582         }
5583         u;
5584         u.i = 1;
5585         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5586         dpsoftrast.bigendian = u.b[3];
5587         dpsoftrast.fb_width = width;
5588         dpsoftrast.fb_height = height;
5589         dpsoftrast.fb_depthpixels = depthpixels;
5590         dpsoftrast.fb_colorpixels[0] = colorpixels;
5591         dpsoftrast.fb_colorpixels[1] = NULL;
5592         dpsoftrast.fb_colorpixels[1] = NULL;
5593         dpsoftrast.fb_colorpixels[1] = NULL;
5594         dpsoftrast.viewport[0] = 0;
5595         dpsoftrast.viewport[1] = 0;
5596         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5597         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5598         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5599         dpsoftrast.texture_firstfree = 1;
5600         dpsoftrast.texture_end = 1;
5601         dpsoftrast.texture_max = 0;
5602         dpsoftrast.color[0] = 1;
5603         dpsoftrast.color[1] = 1;
5604         dpsoftrast.color[2] = 1;
5605         dpsoftrast.color[3] = 1;
5606         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5607         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5608         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5609         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5610         for (i = 0; i < dpsoftrast.numthreads; i++)
5611         {
5612                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5613                 thread->index = i;
5614                 thread->cullface = GL_BACK;
5615         thread->colormask[0] = 1; 
5616                 thread->colormask[1] = 1;
5617                 thread->colormask[2] = 1;
5618                 thread->colormask[3] = 1;
5619                 thread->blendfunc[0] = GL_ONE;
5620                 thread->blendfunc[1] = GL_ZERO;
5621                 thread->depthmask = true;
5622                 thread->depthtest = true;
5623                 thread->depthfunc = GL_LEQUAL;
5624                 thread->scissortest = false;
5625                 thread->viewport[0] = 0;
5626                 thread->viewport[1] = 0;
5627                 thread->viewport[2] = dpsoftrast.fb_width;
5628                 thread->viewport[3] = dpsoftrast.fb_height;
5629                 thread->scissor[0] = 0;
5630                 thread->scissor[1] = 0;
5631                 thread->scissor[2] = dpsoftrast.fb_width;
5632                 thread->scissor[3] = dpsoftrast.fb_height;
5633                 thread->depthrange[0] = 0;
5634                 thread->depthrange[1] = 1;
5635                 thread->polygonoffset[0] = 0;
5636                 thread->polygonoffset[1] = 0;
5637                 thread->clipplane[0] = 0;
5638                 thread->clipplane[1] = 0;
5639                 thread->clipplane[2] = 0;
5640                 thread->clipplane[3] = 1;
5641         
5642                 thread->numspans = 0;
5643                 thread->numtriangles = 0;
5644                 thread->commandoffset = 0;
5645                 thread->waiting = false;
5646                 thread->starving = false;
5647            
5648                 thread->validate = -1;
5649                 DPSOFTRAST_Validate(thread, -1);
5650  
5651                 if (dpsoftrast.usethreads)
5652                 {
5653                         thread->waitcond = Thread_CreateCond();
5654                         thread->drawcond = Thread_CreateCond();
5655                         thread->drawmutex = Thread_CreateMutex();
5656                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5657                 }
5658         }
5659         return 0;
5660 }
5661
5662 void DPSOFTRAST_Shutdown(void)
5663 {
5664         int i;
5665         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5666         {
5667                 DPSOFTRAST_State_Thread *thread;
5668                 for (i = 0; i < dpsoftrast.numthreads; i++)
5669                 {
5670                         thread = &dpsoftrast.threads[i];
5671                         Thread_LockMutex(thread->drawmutex);
5672                         thread->index = -1;
5673                         Thread_CondSignal(thread->drawcond);
5674                         Thread_UnlockMutex(thread->drawmutex);
5675                         Thread_WaitThread(thread->thread, 0);
5676                         Thread_DestroyCond(thread->waitcond);
5677                         Thread_DestroyCond(thread->drawcond);
5678                         Thread_DestroyMutex(thread->drawmutex);
5679                 }
5680         }
5681         for (i = 0;i < dpsoftrast.texture_end;i++)
5682                 if (dpsoftrast.texture[i].bytes)
5683                         MM_FREE(dpsoftrast.texture[i].bytes);
5684         if (dpsoftrast.texture)
5685                 free(dpsoftrast.texture);
5686         if (dpsoftrast.threads)
5687                 MM_FREE(dpsoftrast.threads);
5688         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5689 }
5690