]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
fix compilation of dpsoftrast on mingw32 (use Interlocked* instead of __sync_*)
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
// MSVC only: silence warning 4324 for this file (MSVC reports it when a
// structure is padded because of an alignment specifier, which the
// ALIGN/ATOMIC-decorated structs below do on purpose).
#ifdef _MSC_VER
#pragma warning(disable : 4324)
#endif

// C builds get `bool` as an alias of qboolean (presumably declared by
// quakedef.h - not visible here); C++ builds already have a native bool.
#ifndef __cplusplus
typedef qboolean bool;
#endif

// byte alignments: ALIGN_SIZE is reused as COMMAND_SIZE, ATOMIC_SIZE is the
// alignment passed to _mm_malloc by MM_MALLOC/MM_CALLOC
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32
19
// Per-toolchain definitions of the alignment and atomic-counter helpers.
// Only evaluated when SSE_POSSIBLE is defined; each branch provides:
//   ALIGN(var)               declare var with 16-byte alignment
//   ATOMIC(var)              declare var with 32-byte alignment
//   MEMORY_BARRIER           expands to _mm_sfence()
//   ATOMIC_COUNTER           type of a shared counter
//   ATOMIC_INCREMENT/DECREMENT/ADD  atomic operations on such a counter
#ifdef SSE_POSSIBLE
        #if defined(__APPLE__)
                // Apple: OSAtomic*Barrier primitives from libkern
                #include <libkern/OSAtomic.h>
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
                #define MEMORY_BARRIER (_mm_sfence())
                #define ATOMIC_COUNTER volatile int32_t 
                #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
                #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
        #elif defined(__GNUC__) && defined(WIN32)
                // GCC targeting Windows (e.g. mingw): GCC attributes for alignment,
                // Win32 Interlocked* functions for the counters
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile LONG
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
        #elif defined(__GNUC__)
                // other GCC-compatible compilers: __sync_* builtins
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile int
                #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
                #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
                #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
        #elif defined(_MSC_VER)
                // MSVC: __declspec(align) plus Win32 Interlocked* functions
                #define ALIGN(var) __declspec(align(16)) var
                #define ATOMIC(var) __declspec(align(32)) var
                #define MEMORY_BARRIER (_mm_sfence())
                //(MemoryBarrier())
                #define ATOMIC_COUNTER volatile LONG
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
        #endif
#endif

// Fallbacks for every helper that no branch above defined: no forced
// alignment, no barrier, and plain (non-atomic) integer arithmetic.
#ifndef ALIGN
#define ALIGN(var) var
#endif
#ifndef ATOMIC
#define ATOMIC(var) var
#endif
#ifndef MEMORY_BARRIER
#define MEMORY_BARRIER ((void)0)
#endif
#ifndef ATOMIC_COUNTER
#define ATOMIC_COUNTER int
#endif
#ifndef ATOMIC_INCREMENT
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#ifndef ATOMIC_DECREMENT
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#ifndef ATOMIC_ADD
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
81
// Memory helpers: MM_MALLOC / MM_CALLOC / MM_FREE.
// With SSE_POSSIBLE they allocate ATOMIC_SIZE-aligned storage through
// _mm_malloc/_mm_free; otherwise they map onto malloc/calloc/free.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// Supply _mm_cvtss_f32 through the builtin only on GCC releases before 4.6
// (the releases this workaround was evidently written for). The version test
// must name __GNUC__ and must compare major and minor together.
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
        #define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// Aligned, zero-filled allocation of nmemb elements of size bytes each.
// Returns NULL when the allocation fails or when nmemb*size would overflow.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
        size_t total;
        void *ptr;
        // reject element counts whose byte total does not fit in size_t
        if (size != 0 && nmemb > (size_t)-1 / size)
                return NULL;
        total = nmemb * size;
        ptr = _mm_malloc(total, ATOMIC_SIZE);
        if (ptr != NULL)
                memset(ptr, 0, total);
        return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
104
// Indices of the per-vertex attribute arrays (position, color and eight
// texcoord sets). DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is used
// to size DPSOFTRAST_State_Triangle::attribs and DPSOFTRAST_State::post_array4f.
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION,
        DPSOFTRAST_ARRAY_COLOR,
        DPSOFTRAST_ARRAY_TEXCOORD0,
        DPSOFTRAST_ARRAY_TEXCOORD1,
        DPSOFTRAST_ARRAY_TEXCOORD2,
        DPSOFTRAST_ARRAY_TEXCOORD3,
        DPSOFTRAST_ARRAY_TEXCOORD4,
        DPSOFTRAST_ARRAY_TEXCOORD5,
        DPSOFTRAST_ARRAY_TEXCOORD6,
        DPSOFTRAST_ARRAY_TEXCOORD7,
        DPSOFTRAST_ARRAY_TOTAL
}
DPSOFTRAST_ARRAY;
120
// One slot of the texture table (dpsoftrast.texture). A slot counts as in use
// while `bytes` is non-NULL (see DPSOFTRAST_Texture_GetByIndex).
typedef struct DPSOFTRAST_Texture_s
{
        int flags; // flags value passed to DPSOFTRAST_Texture_New (flag and format bits)
        int width; // dimensions of mip level 0
        int height;
        int depth;
        int sides; // 6 when DPSOFTRAST_TEXTURE_FLAG_CUBEMAP is set, otherwise 1
        DPSOFTRAST_TEXTURE_FILTER filter;
        int mipmaps; // number of entries of mipmap[] that are filled in
        int size; // total size of `bytes`, all mip levels together
        ATOMIC_COUNTER binds; // while non-zero, Texture_Free/UpdatePartial call DPSOFTRAST_Flush first
        unsigned char *bytes; // pixel storage allocated with MM_CALLOC
        // per mip level: [0] byte offset into bytes, [1] byte size of the level,
        // [2] width, [3] height, [4] depth (as filled in by DPSOFTRAST_Texture_New)
        int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
}
DPSOFTRAST_Texture;
136
// Commands are aligned the same way as ALIGN()-declared data.
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Common header shared by every command: an opcode and the command's size.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode;
        unsigned short commandsize;
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares, for one command type:
//   - the enum constant DPSOFTRAST_OPCODE_<name> with value opcodeval
//   - the struct DPSOFTRAST_Command_<name>, which starts with the same
//     opcode/commandsize header as DPSOFTRAST_Command followed by `fields`
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );

// MAXCOMMANDPOOL is the byte size of the command pool array below;
// MAXCOMMANDSIZE is presumably the largest single command - its use is not
// visible in this part of the file.
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
160
// Storage for queued commands: a byte array of DPSOFTRAST_DRAW_MAXCOMMANDPOOL
// bytes plus two int cursors. How the cursors are advanced is not shown in
// this part of the file.
typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
{
        int freecommand;
        int usedcommands;
        ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);

// Per-triangle interpolation data consumed by the span code.
typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
{
        unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        float w[3];
        // one entry per DPSOFTRAST_ARRAY; as read by DPSOFTRAST_CALCATTRIB:
        // row [0] is the per-x slope, row [1] the per-y slope, row [2] the base value
        ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
176
// Evaluate one interpolated triangle attribute at a span's origin (SSE form):
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// `data` and `slope` must be __m128 lvalues; the attribs rows are read with
// aligned loads. Every pointer/lvalue argument is parenthesized so that
// expressions such as `&tri` or `&span` can be passed safely.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        (slope) = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        (data) = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), (slope)), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar form of DPSOFTRAST_CALCATTRIB: `data` and `slope` are float[4]
// (or anything indexable 0..3), computed component by component.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        (slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
        (slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
        (slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
        (slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
        (data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
        (data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
        (data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
        (data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
193                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels produced for a triangle; stored in the owning
// thread's spans[] array.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle; // triangle this span was generated by
        int x; // framebuffer x coord
        int y; // framebuffer y coord
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
        int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
        int depthslope; // depthbuffer value pixel delta
}
DPSOFTRAST_State_Span);
208
// Capacities of the per-thread scratch arrays in DPSOFTRAST_State_Thread:
// spans[], triangles[] and pixelmaskarray[] (the latter plus 4 padding bytes).
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// Bits of DPSOFTRAST_State_Thread::validate. A set bit means the matching
// derived values are stale; DPSOFTRAST_Validate clears the bit and calls the
// corresponding Recalc* function.
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
217
// Blend modes the rasterizer implements. DPSOFTRAST_RecalcBlendFunc picks one
// from the (source factor, destination factor) pair noted on each line.
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE, // GL_ONE, GL_ZERO - also the fallback for unrecognized pairs
        DPSOFTRAST_BLENDMODE_ALPHA, // GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA
        DPSOFTRAST_BLENDMODE_ADDALPHA, // GL_SRC_ALPHA, GL_ONE
        DPSOFTRAST_BLENDMODE_ADD, // GL_ONE, GL_ONE
        DPSOFTRAST_BLENDMODE_INVMOD, // GL_ZERO, GL_ONE_MINUS_SRC_COLOR
        DPSOFTRAST_BLENDMODE_MUL, // GL_ZERO, GL_SRC_COLOR or GL_DST_COLOR, GL_ZERO
        DPSOFTRAST_BLENDMODE_MUL2, // GL_DST_COLOR, GL_SRC_COLOR
        DPSOFTRAST_BLENDMODE_SUBALPHA, // GL_SRC_ALPHA, GL_ONE while blendsubtract is set
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA, // GL_ONE, GL_ONE_MINUS_SRC_ALPHA
        DPSOFTRAST_BLENDMODE_INVADD, // GL_ONE_MINUS_DST_COLOR, GL_ONE
        DPSOFTRAST_BLENDMODE_TOTAL // number of blend modes
}
DPSOFTRAST_BLENDMODE;
233
// State owned by one rasterizer thread: its own render-state values, values
// derived from them (fb_*), the scanline bands it covers, synchronization
// handles, and scratch buffers for spans and triangles.
typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
{
        void *thread;
        int index; // position of this thread; used by DPSOFTRAST_RecalcThread to place its bands
        
        // render state
        int cullface;
        int colormask[4];
        int blendfunc[2]; // [0] source factor, [1] destination factor (see DPSOFTRAST_RecalcBlendFunc)
        int blendsubtract;
        int depthmask;
        int depthtest;
        int depthfunc;
        int scissortest;
        int alphatest;
        int alphafunc;
        float alphavalue;
        int viewport[4];
        int scissor[4];
        float depthrange[2];
        float polygonoffset[2];
        float clipplane[4];
        ALIGN(float fb_clipplane[4]); // clipplane rescaled by the viewport (DPSOFTRAST_RecalcClipPlane)

        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        // elements of dpsoftrast.texture; DPSOFTRAST_Texture_Grow rebases these when that array moves
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
        
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // DPSOFTRAST_VALIDATE_ flags
        int validate;

        // derived values (DPSOFTRAST_VALIDATE_FB)
        int fb_colormask;
        int fb_scissor[4]; // x, y, width, height after clamping to the framebuffer (DPSOFTRAST_RecalcFB)
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
        int fb_depthfunc; // depthfunc, or GL_ALWAYS while depthtest is off

        // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
        int fb_blendmode; // a DPSOFTRAST_BLENDMODE value

        // band boundaries
        // (two y ranges when dpsoftrast.interlace is set, otherwise both ranges are equal)
        int miny1;
        int maxy1;
        int miny2;
        int maxy2;

        ATOMIC(volatile int commandoffset);

        volatile bool waiting;
        volatile bool starving;
        void *waitcond;
        void *drawcond;
        void *drawmutex;

        // scratch buffers
        int numspans;
        int numtriangles;
        DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
        DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
        unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
}
DPSOFTRAST_State_Thread);
302
// Global rasterizer state; the single instance is `dpsoftrast` below.
typedef ATOMIC(struct DPSOFTRAST_State_s
{
        // framebuffer dimensions and pixel storage
        int fb_width;
        int fb_height;
        unsigned int *fb_depthpixels;
        unsigned int *fb_colorpixels[4];

        int viewport[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        float color[4];
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // vertex array pointers and strides
        const float *pointer_vertex3f;
        const float *pointer_color4f;
        const unsigned char *pointer_color4ub;
        const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int stride_vertex;
        int stride_color;
        int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        // elements of `texture`; DPSOFTRAST_Texture_Grow rebases these when the array moves
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];

        int firstvertex;
        int numvertices;
        float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
        float *screencoord4f;
        int drawstarty;
        int drawendy;
        int drawclipped;
        
        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        // texture table: `texture` holds texture_max slots; DPSOFTRAST_Texture_GetByIndex
        // only accepts indices in [1, texture_end); DPSOFTRAST_Texture_New starts
        // its search for an empty slot at texture_firstfree
        int texture_max;
        int texture_end;
        int texture_firstfree;
        DPSOFTRAST_Texture *texture;

        int bigendian;

        // error reporting
        const char *errorstring;

        bool usethreads;
        int interlace; // non-zero: each thread gets two separate scanline bands (DPSOFTRAST_RecalcThread)
        int numthreads;
        DPSOFTRAST_State_Thread *threads; // numthreads entries

        ATOMIC(volatile int drawcommand);

        DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

DPSOFTRAST_State dpsoftrast;
362
// Depth buffer scaling: DPSOFTRAST_DEPTHSCALE is 2^30.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float components (nominally 0..1) into one int laid out as
// A<<24 | R<<16 | G<<8 | B, rounding each component to the nearest 8-bit value.
// Arguments are parenthesized so expressions can be passed, and the shifts are
// done on unsigned values because shifting a signed 255 left by 24 is undefined
// behaviour; the result is converted back to int to keep the macro's type.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth buffer value DEPTHSCALE*(1-d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
367
368 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
369 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
370
371 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
372 {
373         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
374         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
375         fb_viewportcenter[3] = 0.5f;
376         fb_viewportcenter[0] = 0.0f;
377         fb_viewportscale[1] = 0.5f * viewport[2];
378         fb_viewportscale[2] = -0.5f * viewport[3];
379         fb_viewportscale[3] = 0.5f;
380         fb_viewportscale[0] = 1.0f;
381 }
382
383 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
384 {
385         if (dpsoftrast.interlace)
386         {
387                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
388                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
389                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391         }
392         else
393         {
394                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
395                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
396         }
397 }
398
399 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
400 {
401         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
402         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
403         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
404         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
405         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
406 }
407
408 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
409 {
410         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
411         // and viewport projection values
412         int x1, x2;
413         int y1, y2;
414         x1 = thread->scissor[0];
415         x2 = thread->scissor[0] + thread->scissor[2];
416         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
417         y2 = dpsoftrast.fb_height - thread->scissor[1];
418         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
419         if (x1 < 0) x1 = 0;
420         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
421         if (y1 < 0) y1 = 0;
422         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
423         thread->fb_scissor[0] = x1;
424         thread->fb_scissor[1] = y1;
425         thread->fb_scissor[2] = x2 - x1;
426         thread->fb_scissor[3] = y2 - y1;
427
428         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
429         DPSOFTRAST_RecalcClipPlane(thread);
430         DPSOFTRAST_RecalcThread(thread);
431 }
432
433 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
434 {
435         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
436 }
437
438 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
439 {
440         if (thread->blendsubtract)
441         {
442                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
443                 {
444                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
445                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
446                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
447                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
448                 }
449         }
450         else
451         {       
452                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
453                 {
454                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
455                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
456                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
457                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
458                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
459                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
460                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
461                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
462                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
463                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
464                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
465                 }
466         }
467 }
468
// Calls DPSOFTRAST_Validate(thread, f) only when one of the bits in f is set in
// thread->validate; always evaluates to 0. `thread` is parenthesized so that
// expressions such as &threads[i] work; note it is still evaluated twice.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
470
471 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
472 {
473         mask &= thread->validate;
474         if (!mask)
475                 return;
476         if (mask & DPSOFTRAST_VALIDATE_FB)
477         {
478                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
479                 DPSOFTRAST_RecalcFB(thread);
480         }
481         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
482         {
483                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
484                 DPSOFTRAST_RecalcDepthFunc(thread);
485         }
486         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
487         {
488                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
489                 DPSOFTRAST_RecalcBlendFunc(thread);
490         }
491 }
492
493 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
494 {
495         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
496                 return &dpsoftrast.texture[index];
497         return NULL;
498 }
499
500 static void DPSOFTRAST_Texture_Grow(void)
501 {
502         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
503         DPSOFTRAST_State_Thread *thread;
504         int i;
505         int j;
506         DPSOFTRAST_Flush();
507         // expand texture array as needed
508         if (dpsoftrast.texture_max < 1024)
509                 dpsoftrast.texture_max = 1024;
510         else
511                 dpsoftrast.texture_max *= 2;
512         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
513         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
514                 if (dpsoftrast.texbound[i])
515                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
516         for (j = 0; j < dpsoftrast.numthreads; j++)
517         {
518                 thread = &dpsoftrast.threads[j];
519                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
520                         if (thread->texbound[i])
521                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
522         }
523 }
524
525 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
526 {
527         int w;
528         int h;
529         int d;
530         int size;
531         int s;
532         int texnum;
533         int mipmaps;
534         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
535         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
536         DPSOFTRAST_Texture *texture;
537         if (width*height*depth < 1)
538         {
539                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
540                 return 0;
541         }
542         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
543         {
544                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
545                 return 0;
546         }
547         switch(texformat)
548         {
549         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
550         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
551         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
552                 break;
553         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
554                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
555                 {
556                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
557                         return 0;
558                 }
559                 if (depth != 1)
560                 {
561                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
562                         return 0;
563                 }
564                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
565                 {
566                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
567                         return 0;
568                 }
569                 break;
570         }
571         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
572         {
573                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
574                 return 0;
575         }
576         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
577         {
578                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
579                 return 0;
580         }
581         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
582         {
583                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
584                 return 0;
585         }
586         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
587         {
588                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
589                 return 0;
590         }
591         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
592         {
593                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
594                 return 0;
595         }
596         // find first empty slot in texture array
597         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
598                 if (!dpsoftrast.texture[texnum].bytes)
599                         break;
600         dpsoftrast.texture_firstfree = texnum + 1;
601         if (dpsoftrast.texture_max <= texnum)
602                 DPSOFTRAST_Texture_Grow();
603         if (dpsoftrast.texture_end <= texnum)
604                 dpsoftrast.texture_end = texnum + 1;
605         texture = &dpsoftrast.texture[texnum];
606         memset(texture, 0, sizeof(*texture));
607         texture->flags = flags;
608         texture->width = width;
609         texture->height = height;
610         texture->depth = depth;
611         texture->sides = sides;
612         texture->binds = 0;
613         w = width;
614         h = height;
615         d = depth;
616         size = 0;
617         mipmaps = 0;
618         w = width;
619         h = height;
620         d = depth;
621         for (;;)
622         {
623                 s = w * h * d * sides * 4;
624                 texture->mipmap[mipmaps][0] = size;
625                 texture->mipmap[mipmaps][1] = s;
626                 texture->mipmap[mipmaps][2] = w;
627                 texture->mipmap[mipmaps][3] = h;
628                 texture->mipmap[mipmaps][4] = d;
629                 size += s;
630                 mipmaps++;
631                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
632                         break;
633                 if (w > 1) w >>= 1;
634                 if (h > 1) h >>= 1;
635                 if (d > 1) d >>= 1;
636         }
637         texture->mipmaps = mipmaps;
638         texture->size = size;
639
640         // allocate the pixels now
641         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
642
643         return texnum;
644 }
645 void DPSOFTRAST_Texture_Free(int index)
646 {
647         DPSOFTRAST_Texture *texture;
648         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
649         if (texture->binds)
650                 DPSOFTRAST_Flush();
651         if (texture->bytes)
652                 MM_FREE(texture->bytes);
653         texture->bytes = NULL;
654         memset(texture, 0, sizeof(*texture));
655         // adjust the free range and used range
656         if (dpsoftrast.texture_firstfree > index)
657                 dpsoftrast.texture_firstfree = index;
658         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
659                 dpsoftrast.texture_end--;
660 }
661 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
662 {
663         int i, x, y, z, w, layer0, layer1, row0, row1;
664         unsigned char *o, *i0, *i1, *i2, *i3;
665         DPSOFTRAST_Texture *texture;
666         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
667         if (texture->mipmaps <= 1)
668                 return;
669         for (i = 1;i < texture->mipmaps;i++)
670         {
671                 for (z = 0;z < texture->mipmap[i][4];z++)
672                 {
673                         layer0 = z*2;
674                         layer1 = z*2+1;
675                         if (layer1 >= texture->mipmap[i-1][4])
676                                 layer1 = texture->mipmap[i-1][4]-1;
677                         for (y = 0;y < texture->mipmap[i][3];y++)
678                         {
679                                 row0 = y*2;
680                                 row1 = y*2+1;
681                                 if (row1 >= texture->mipmap[i-1][3])
682                                         row1 = texture->mipmap[i-1][3]-1;
683                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
684                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
685                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
686                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
687                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
688                                 w = texture->mipmap[i][2];
689                                 if (layer1 > layer0)
690                                 {
691                                         if (texture->mipmap[i-1][2] > 1)
692                                         {
693                                                 // average 3D texture
694                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
695                                                 {
696                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
697                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
698                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
699                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
700                                                 }
701                                         }
702                                         else
703                                         {
704                                                 // average 3D mipmap with parent width == 1
705                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
706                                                 {
707                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
708                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
709                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
710                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
711                                                 }
712                                         }
713                                 }
714                                 else
715                                 {
716                                         if (texture->mipmap[i-1][2] > 1)
717                                         {
718                                                 // average 2D texture (common case)
719                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
720                                                 {
721                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
722                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
723                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
724                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
725                                                 }
726                                         }
727                                         else
728                                         {
729                                                 // 2D texture with parent width == 1
730                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
731                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
732                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
733                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
734                                         }
735                                 }
736                         }
737                 }
738         }
739 }
740 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
741 {
742         DPSOFTRAST_Texture *texture;
743         unsigned char *dst;
744         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
745         if (texture->binds)
746                 DPSOFTRAST_Flush();
747         if (pixels)
748         {
749                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
750                 while (blockheight > 0)
751                 {
752                         memcpy(dst, pixels, blockwidth * 4);
753                         pixels += blockwidth * 4;
754                         dst += texture->mipmap[0][2] * 4;
755                         blockheight--;
756                 }
757         }
758         DPSOFTRAST_Texture_CalculateMipmaps(index);
759 }
760 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
761 {
762         DPSOFTRAST_Texture *texture;
763         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
764         if (texture->binds)
765                 DPSOFTRAST_Flush();
766         if (pixels)
767                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
768         DPSOFTRAST_Texture_CalculateMipmaps(index);
769 }
770 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
771 {
772         DPSOFTRAST_Texture *texture;
773         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
774         return texture->mipmap[mip][2];
775 }
776 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
777 {
778         DPSOFTRAST_Texture *texture;
779         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
780         return texture->mipmap[mip][3];
781 }
782 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
783 {
784         DPSOFTRAST_Texture *texture;
785         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
786         return texture->mipmap[mip][4];
787 }
788 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
789 {
790         DPSOFTRAST_Texture *texture;
791         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
792         if (texture->binds)
793                 DPSOFTRAST_Flush();
794         return texture->bytes + texture->mipmap[mip][0];
795 }
796 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
797 {
798         DPSOFTRAST_Texture *texture;
799         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
800         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
801         {
802                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
803                 return;
804         }
805         if (texture->binds)
806                 DPSOFTRAST_Flush();
807         texture->filter = filter;
808 }
809
810 static void DPSOFTRAST_Draw_FlushThreads(void);
811
812 static void DPSOFTRAST_Draw_SyncCommands(void)
813 {
814         if(dpsoftrast.usethreads) MEMORY_BARRIER;
815         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
816 }
817
818 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
819 {
820         DPSOFTRAST_State_Thread *thread;
821         int i;
822         int freecommand = dpsoftrast.commandpool.freecommand;
823         int usedcommands = dpsoftrast.commandpool.usedcommands;
824         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
825                 return;
826         DPSOFTRAST_Draw_SyncCommands();
827         for(;;)
828         {
829                 int waitindex = -1;
830                 int commandoffset;
831                 usedcommands = 0;
832                 for (i = 0; i < dpsoftrast.numthreads; i++)
833                 {
834                         thread = &dpsoftrast.threads[i]; 
835                         commandoffset = freecommand - thread->commandoffset;
836                         if (commandoffset < 0)
837                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
838                         if (commandoffset > usedcommands)
839                         {
840                                 waitindex = i;
841                                 usedcommands = commandoffset;
842                         }
843                 }
844                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
845                         break;
846                 thread = &dpsoftrast.threads[waitindex];
847                 Thread_LockMutex(thread->drawmutex);
848                 if (thread->commandoffset != dpsoftrast.drawcommand)
849                 {
850                         thread->waiting = true;
851                         if (thread->starving) Thread_CondSignal(thread->drawcond);
852                         Thread_CondWait(thread->waitcond, thread->drawmutex);
853                         thread->waiting = false;
854                 }
855                 Thread_UnlockMutex(thread->drawmutex);
856         }
857         dpsoftrast.commandpool.usedcommands = usedcommands;
858 }
859
// Pads size up to a multiple of COMMAND_SIZE. The mask arithmetic presumes
// COMMAND_SIZE is a power of two (TODO confirm at its definition).
// Note: size is evaluated twice.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	( (size) + ( (COMMAND_SIZE - ((size) & (COMMAND_SIZE - 1))) & (COMMAND_SIZE - 1) ) )
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> from the command pool, using the padded struct size.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name, \
		DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_##name))))
864
865 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
866 {
867         DPSOFTRAST_Command *command;
868         int freecommand = dpsoftrast.commandpool.freecommand;
869         int usedcommands = dpsoftrast.commandpool.usedcommands;
870         int extra = sizeof(DPSOFTRAST_Command);
871         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
872                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
873         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
874         {
875                 if (dpsoftrast.usethreads)
876                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
877                 else
878                         DPSOFTRAST_Draw_FlushThreads();
879                 freecommand = dpsoftrast.commandpool.freecommand;
880                 usedcommands = dpsoftrast.commandpool.usedcommands;
881         }
882         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
883         {
884                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
885                 command->opcode = DPSOFTRAST_OPCODE_Reset;
886                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
887                 freecommand = 0;
888         }
889         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
890         command->opcode = opcode;
891         command->commandsize = size;
892         freecommand += size;
893         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
894                 freecommand = 0;
895         dpsoftrast.commandpool.freecommand = freecommand;
896         dpsoftrast.commandpool.usedcommands = usedcommands + size;
897         return command;
898 }
899
900 static void DPSOFTRAST_UndoCommand(int size)
901 {
902         int freecommand = dpsoftrast.commandpool.freecommand;
903         int usedcommands = dpsoftrast.commandpool.usedcommands;
904         freecommand -= size;
905         if (freecommand < 0)
906                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
907         usedcommands -= size;
908         dpsoftrast.commandpool.freecommand = freecommand;
909         dpsoftrast.commandpool.usedcommands = usedcommands;
910 }
911                 
912 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
913 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
914 {
915         thread->viewport[0] = command->x;
916         thread->viewport[1] = command->y;
917         thread->viewport[2] = command->width;
918         thread->viewport[3] = command->height;
919         thread->validate |= DPSOFTRAST_VALIDATE_FB;
920 }
921 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
922 {
923         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
924         command->x = x;
925         command->y = y;
926         command->width = width;
927         command->height = height;
928
929         dpsoftrast.viewport[0] = x;
930         dpsoftrast.viewport[1] = y;
931         dpsoftrast.viewport[2] = width;
932         dpsoftrast.viewport[3] = height;
933         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
934 }
935
936 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
937 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
938 {
939         int i, x1, y1, x2, y2, w, h, x, y;
940         int miny1, maxy1, miny2, maxy2;
941         int bandy;
942         unsigned int *p;
943         unsigned int c;
944         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
945         miny1 = thread->miny1;
946         maxy1 = thread->maxy1;
947         miny2 = thread->miny2;
948         maxy2 = thread->maxy2;
949         x1 = thread->fb_scissor[0];
950         y1 = thread->fb_scissor[1];
951         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
952         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
953         if (y1 < miny1) y1 = miny1;
954         if (y2 > maxy2) y2 = maxy2;
955         w = x2 - x1;
956         h = y2 - y1;
957         if (w < 1 || h < 1)
958                 return;
959         // FIXME: honor fb_colormask?
960         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
961         for (i = 0;i < 4;i++)
962         {
963                 if (!dpsoftrast.fb_colorpixels[i])
964                         continue;
965                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
966                 for (;y < bandy;y++)
967                 {
968                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
969                         for (x = x1;x < x2;x++)
970                                 p[x] = c;
971                 }
972         }
973 }
974 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
975 {
976         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
977         command->r = r;
978         command->g = g;
979         command->b = b;
980         command->a = a;
981 }
982
983 DEFCOMMAND(3, ClearDepth, float depth;)
984 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
985 {
986         int x1, y1, x2, y2, w, h, x, y;
987         int miny1, maxy1, miny2, maxy2;
988         int bandy;
989         unsigned int *p;
990         unsigned int c;
991         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
992         miny1 = thread->miny1;
993         maxy1 = thread->maxy1;
994         miny2 = thread->miny2;
995         maxy2 = thread->maxy2;
996         x1 = thread->fb_scissor[0];
997         y1 = thread->fb_scissor[1];
998         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
999         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
1000         if (y1 < miny1) y1 = miny1;
1001         if (y2 > maxy2) y2 = maxy2;
1002         w = x2 - x1;
1003         h = y2 - y1;
1004         if (w < 1 || h < 1)
1005                 return;
1006         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
1007         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1008         for (;y < bandy;y++)
1009         {
1010                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1011                 for (x = x1;x < x2;x++)
1012                         p[x] = c;
1013         }
1014 }
1015 void DPSOFTRAST_ClearDepth(float d)
1016 {
1017         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1018         command->depth = d;
1019 }
1020
1021 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1022 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1023 {
1024         thread->colormask[0] = command->r != 0;
1025         thread->colormask[1] = command->g != 0;
1026         thread->colormask[2] = command->b != 0;
1027         thread->colormask[3] = command->a != 0;
1028         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1029 }
1030 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1031 {
1032         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1033         command->r = r;
1034         command->g = g;
1035         command->b = b;
1036         command->a = a;
1037 }
1038
1039 DEFCOMMAND(5, DepthTest, int enable;)
1040 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1041 {
1042         thread->depthtest = command->enable;
1043         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1044 }
1045 void DPSOFTRAST_DepthTest(int enable)
1046 {
1047         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1048         command->enable = enable;
1049 }
1050
1051 DEFCOMMAND(6, ScissorTest, int enable;)
1052 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1053 {
1054         thread->scissortest = command->enable;
1055         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1056 }
1057 void DPSOFTRAST_ScissorTest(int enable)
1058 {
1059         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1060         command->enable = enable;
1061 }
1062
1063 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1064 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1065 {
1066         thread->scissor[0] = command->x;
1067         thread->scissor[1] = command->y;
1068         thread->scissor[2] = command->width;
1069         thread->scissor[3] = command->height;
1070         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1071 }
1072 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1073 {
1074         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1075         command->x = x;
1076         command->y = y;
1077         command->width = width;
1078         command->height = height;
1079 }
1080
1081 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1082 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1083 {
1084         thread->blendfunc[0] = command->sfactor;
1085         thread->blendfunc[1] = command->dfactor;
1086         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1087 }
1088 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1089 {
1090         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1091         command->sfactor = sfactor;
1092         command->dfactor = dfactor;
1093 }
1094
1095 DEFCOMMAND(9, BlendSubtract, int enable;)
1096 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1097 {
1098         thread->blendsubtract = command->enable;
1099         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1100 }
1101 void DPSOFTRAST_BlendSubtract(int enable)
1102 {
1103         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1104         command->enable = enable;
1105 }
1106
1107 DEFCOMMAND(10, DepthMask, int enable;)
1108 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1109 {
1110         thread->depthmask = command->enable;
1111 }
1112 void DPSOFTRAST_DepthMask(int enable)
1113 {
1114         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1115         command->enable = enable;
1116 }
1117
1118 DEFCOMMAND(11, DepthFunc, int func;)
1119 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1120 {
1121         thread->depthfunc = command->func;
1122 }
1123 void DPSOFTRAST_DepthFunc(int func)
1124 {
1125         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1126         command->func = func;
1127 }
1128
1129 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1130 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1131 {
1132         thread->depthrange[0] = command->nearval;
1133         thread->depthrange[1] = command->farval;
1134 }
1135 void DPSOFTRAST_DepthRange(float nearval, float farval)
1136 {
1137         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1138         command->nearval = nearval;
1139         command->farval = farval;
1140 }
1141
1142 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1143 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1144 {
1145         thread->polygonoffset[0] = command->alongnormal;
1146         thread->polygonoffset[1] = command->intoview;
1147 }
1148 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1149 {
1150         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1151         command->alongnormal = alongnormal;
1152         command->intoview = intoview;
1153 }
1154
1155 DEFCOMMAND(14, CullFace, int mode;)
1156 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1157 {
1158         thread->cullface = command->mode;
1159 }
1160 void DPSOFTRAST_CullFace(int mode)
1161 {
1162         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1163         command->mode = mode;
1164 }
1165
1166 DEFCOMMAND(15, AlphaTest, int enable;)
1167 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1168 {
1169         thread->alphatest = command->enable;
1170 }
1171 void DPSOFTRAST_AlphaTest(int enable)
1172 {
1173         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1174         command->enable = enable;
1175 }
1176
1177 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1178 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1179 {
1180         thread->alphafunc = command->func;
1181         thread->alphavalue = command->ref;
1182 }
1183 void DPSOFTRAST_AlphaFunc(int func, float ref)
1184 {
1185         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1186         command->func = func;
1187         command->ref = ref;
1188 }
1189
1190 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1191 {
1192         dpsoftrast.color[0] = r;
1193         dpsoftrast.color[1] = g;
1194         dpsoftrast.color[2] = b;
1195         dpsoftrast.color[3] = a;
1196 }
1197
1198 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1199 {
1200         int outstride = blockwidth * 4;
1201         int instride = dpsoftrast.fb_width * 4;
1202         int bx1 = blockx;
1203         int by1 = blocky;
1204         int bx2 = blockx + blockwidth;
1205         int by2 = blocky + blockheight;
1206         int bw;
1207         int x;
1208         int y;
1209         unsigned char *inpixels;
1210         unsigned char *b;
1211         unsigned char *o;
1212         DPSOFTRAST_Flush();
1213         if (bx1 < 0) bx1 = 0;
1214         if (by1 < 0) by1 = 0;
1215         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1216         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1217         bw = bx2 - bx1;
1218         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1219         if (dpsoftrast.bigendian)
1220         {
1221                 for (y = by1;y < by2;y++)
1222                 {
1223                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1224                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1225                         for (x = bx1;x < bx2;x++)
1226                         {
1227                                 o[0] = b[3];
1228                                 o[1] = b[2];
1229                                 o[2] = b[1];
1230                                 o[3] = b[0];
1231                                 o += 4;
1232                                 b += 4;
1233                         }
1234                 }
1235         }
1236         else
1237         {
1238                 for (y = by1;y < by2;y++)
1239                 {
1240                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1241                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1242                         memcpy(o, b, bw*4);
1243                 }
1244         }
1245
1246 }
1247 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1248 {
1249         int tx1 = tx;
1250         int ty1 = ty;
1251         int tx2 = tx + width;
1252         int ty2 = ty + height;
1253         int sx1 = sx;
1254         int sy1 = sy;
1255         int sx2 = sx + width;
1256         int sy2 = sy + height;
1257         int swidth;
1258         int sheight;
1259         int twidth;
1260         int theight;
1261         int sw;
1262         int sh;
1263         int tw;
1264         int th;
1265         int y;
1266         unsigned int *spixels;
1267         unsigned int *tpixels;
1268         DPSOFTRAST_Texture *texture;
1269         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1270         if (mip < 0 || mip >= texture->mipmaps) return;
1271         DPSOFTRAST_Flush();
1272         spixels = dpsoftrast.fb_colorpixels[0];
1273         swidth = dpsoftrast.fb_width;
1274         sheight = dpsoftrast.fb_height;
1275         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1276         twidth = texture->mipmap[mip][2];
1277         theight = texture->mipmap[mip][3];
1278         if (tx1 < 0) tx1 = 0;
1279         if (ty1 < 0) ty1 = 0;
1280         if (tx2 > twidth) tx2 = twidth;
1281         if (ty2 > theight) ty2 = theight;
1282         if (sx1 < 0) sx1 = 0;
1283         if (sy1 < 0) sy1 = 0;
1284         if (sx2 > swidth) sx2 = swidth;
1285         if (sy2 > sheight) sy2 = sheight;
1286         tw = tx2 - tx1;
1287         th = ty2 - ty1;
1288         sw = sx2 - sx1;
1289         sh = sy2 - sy1;
1290         if (tw > sw) tw = sw;
1291         if (th > sh) th = sh;
1292         if (tw < 1 || th < 1)
1293                 return;
1294         sy1 = sheight - 1 - sy1;
1295         for (y = 0;y < th;y++)
1296                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1297         if (texture->mipmaps > 1)
1298                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1299 }
1300
1301 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1302 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1303 {
1304         if (thread->texbound[command->unitnum])
1305                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1306         thread->texbound[command->unitnum] = command->texture;
1307 }
1308 void DPSOFTRAST_SetTexture(int unitnum, int index)
1309 {
1310         DPSOFTRAST_Command_SetTexture *command;
1311         DPSOFTRAST_Texture *texture;
1312         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1313         {
1314                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1315                 return;
1316         }
1317         texture = DPSOFTRAST_Texture_GetByIndex(index);
1318         if (index && !texture)
1319         {
1320                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1321                 return;
1322         }
1323
1324         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1325         command->unitnum = unitnum;
1326         command->texture = texture;
1327
1328         dpsoftrast.texbound[unitnum] = texture;
1329         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1330 }
1331
1332 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1333 {
1334         dpsoftrast.pointer_vertex3f = vertex3f;
1335         dpsoftrast.stride_vertex = stride;
1336 }
1337 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1338 {
1339         dpsoftrast.pointer_color4f = color4f;
1340         dpsoftrast.pointer_color4ub = NULL;
1341         dpsoftrast.stride_color = stride;
1342 }
1343 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1344 {
1345         dpsoftrast.pointer_color4f = NULL;
1346         dpsoftrast.pointer_color4ub = color4ub;
1347         dpsoftrast.stride_color = stride;
1348 }
1349 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1350 {
1351         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1352         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1353         dpsoftrast.stride_texcoord[unitnum] = stride;
1354 }
1355
1356 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1357 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1358 {
1359         thread->shader_mode = command->mode;
1360         thread->shader_permutation = command->permutation;
1361         thread->shader_exactspecularmath = command->exactspecularmath;
1362 }
1363 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1364 {
1365         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1366         command->mode = mode;
1367         command->permutation = permutation;
1368         command->exactspecularmath = exactspecularmath;
1369
1370         dpsoftrast.shader_mode = mode;
1371         dpsoftrast.shader_permutation = permutation;
1372         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1373 }
1374
1375 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1376 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1377 {
1378         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1379 }
1380 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1381 {
1382         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1383         command->index = index;
1384         command->val[0] = v0;
1385         command->val[1] = v1;
1386         command->val[2] = v2;
1387         command->val[3] = v3;
1388
1389         dpsoftrast.uniform4f[index*4+0] = v0;
1390         dpsoftrast.uniform4f[index*4+1] = v1;
1391         dpsoftrast.uniform4f[index*4+2] = v2;
1392         dpsoftrast.uniform4f[index*4+3] = v3;
1393 }
1394 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1395 {
1396         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1397         command->index = index;
1398         memcpy(command->val, v, sizeof(command->val));
1399
1400         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1401 }
1402
1403 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1404 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1405 {
1406         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1407 }
1408 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1409 {
1410 #ifdef SSE_POSSIBLE
1411         int i, index;
1412         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1413         {
1414                 __m128 m0, m1, m2, m3;
1415                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1416                 command->index = (DPSOFTRAST_UNIFORM)index;
1417                 if (((size_t)v)&(ALIGN_SIZE-1))
1418                 {
1419                         m0 = _mm_loadu_ps(v);
1420                         m1 = _mm_loadu_ps(v+4);
1421                         m2 = _mm_loadu_ps(v+8);
1422                         m3 = _mm_loadu_ps(v+12);
1423                 }
1424                 else
1425                 {
1426                         m0 = _mm_load_ps(v);
1427                         m1 = _mm_load_ps(v+4);
1428                         m2 = _mm_load_ps(v+8);
1429                         m3 = _mm_load_ps(v+12);
1430                 }
1431                 if (transpose)
1432                 {
1433                         __m128 t0, t1, t2, t3;
1434                         t0 = _mm_unpacklo_ps(m0, m1);
1435                         t1 = _mm_unpacklo_ps(m2, m3);
1436                         t2 = _mm_unpackhi_ps(m0, m1);
1437                         t3 = _mm_unpackhi_ps(m2, m3);
1438                         m0 = _mm_movelh_ps(t0, t1);
1439                         m1 = _mm_movehl_ps(t1, t0);
1440                         m2 = _mm_movelh_ps(t2, t3);
1441                         m3 = _mm_movehl_ps(t3, t2);                     
1442                 }
1443                 _mm_store_ps(command->val, m0);
1444                 _mm_store_ps(command->val+4, m1);
1445                 _mm_store_ps(command->val+8, m2);
1446                 _mm_store_ps(command->val+12, m3);
1447                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1448                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1449                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1450                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1451         }
1452 #endif
1453 }
1454
1455 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1456 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1457 {
1458         thread->uniform1i[command->index] = command->val;
1459 }
1460 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1461 {
1462         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1463         command->index = index;
1464         command->val = i0;
1465
1466         dpsoftrast.uniform1i[command->index] = i0;
1467 }
1468
1469 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1470 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1471 {
1472         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1473         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1474 }
1475 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1476 {
1477         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1478         command->clipplane[0] = x;
1479         command->clipplane[1] = y;
1480         command->clipplane[2] = z;
1481         command->clipplane[3] = w;
1482 }
1483
1484 #ifdef SSE_POSSIBLE
1485 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1486 {
1487         float *end = dst + size*4;
1488         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1489         {
1490                 while (dst < end)
1491                 {
1492                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1493                         dst += 4;
1494                         src += stride;
1495                 }
1496         }
1497         else
1498         {
1499                 while (dst < end)
1500                 {
1501                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1502                         dst += 4;
1503                         src += stride;
1504                 }
1505         }
1506 }
1507
1508 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1509 {
1510         float *end = dst + size*4;
1511         if (stride == sizeof(float[3]))
1512         {
1513                 float *end4 = dst + (size&~3)*4;        
1514                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1515                 {
1516                         while (dst < end4)
1517                         {
1518                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1519                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1520                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1521                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1522                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1523                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1524                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1525                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1526                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1527                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1528                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1529                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1530                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1531                                 dst += 16;
1532                                 src += 4*sizeof(float[3]);
1533                         }
1534                 }
1535                 else
1536                 {
1537                         while (dst < end4)
1538                         {
1539                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1540                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1541                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1542                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1543                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1544                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1545                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1546                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1547                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1548                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1549                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1550                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1551                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1552                                 dst += 16;
1553                                 src += 4*sizeof(float[3]);
1554                         }
1555                 }
1556         }
1557         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1558         {
1559                 while (dst < end)
1560                 {
1561                         __m128 v = _mm_loadu_ps((const float *)src);
1562                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1563                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1564                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1565                         _mm_store_ps(dst, v);
1566                         dst += 4;
1567                         src += stride;
1568                 }
1569         }
1570         else
1571         {
1572                 while (dst < end)
1573                 {
1574                         __m128 v = _mm_load_ps((const float *)src);
1575                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1576                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1577                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1578                         _mm_store_ps(dst, v);
1579                         dst += 4;
1580                         src += stride;
1581                 }
1582         }
1583 }
1584
1585 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1586 {
1587         float *end = dst + size*4;
1588         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1589         if (stride == sizeof(float[2]))
1590         {
1591                 float *end2 = dst + (size&~1)*4;
1592                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1593                 {
1594                         while (dst < end2)
1595                         {
1596                                 __m128 v = _mm_loadu_ps((const float *)src);
1597                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1598                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1599                                 dst += 8;
1600                                 src += 2*sizeof(float[2]);
1601                         }
1602                 }
1603                 else
1604                 {
1605                         while (dst < end2)
1606                         {
1607                                 __m128 v = _mm_load_ps((const float *)src);
1608                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1609                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1610                                 dst += 8;
1611                                 src += 2*sizeof(float[2]);
1612                         }
1613                 }
1614         }
1615         while (dst < end)
1616         {
1617                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1618                 dst += 4;
1619                 src += stride;
1620         }
1621 }
1622
1623 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1624 {
1625         float *end = dst + size*4;
1626         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1627         if (stride == sizeof(unsigned char[4]))
1628         {
1629                 float *end4 = dst + (size&~3)*4;
1630                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1631                 {
1632                         while (dst < end4)
1633                         {
1634                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1635                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1636                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1637                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1638                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1639                                 dst += 16;
1640                                 src += 4*sizeof(unsigned char[4]);
1641                         }
1642                 }
1643                 else
1644                 {
1645                         while (dst < end4)
1646                         {
1647                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1648                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1649                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1650                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1651                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1652                                 dst += 16;
1653                                 src += 4*sizeof(unsigned char[4]);
1654                         }
1655                 }
1656         }
1657         while (dst < end)
1658         {
1659                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1660                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1661                 dst += 4;
1662                 src += stride;
1663         }
1664 }
1665
// Replicates the four floats at src into `size` consecutive four-float slots
// of dst. src may be unaligned; dst is written with _mm_store_ps and therefore
// has to be 16-byte aligned.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0;i < size;i++)
		_mm_store_ps(dst + 4*i, value);
}
1676 #endif
1677
// Multiplies numitems four-float vectors by the 16-float matrix inmatrix16f:
// out = x*M[0..3] + y*M[4..7] + z*M[8..11] + w*M[12..15].
// out4f is written with _mm_store_ps and therefore has to be 16-byte aligned;
// in4f and inmatrix16f may be unaligned. An identity matrix (compared bytewise)
// degrades to a plain copy, skipped when out4f == in4f.
// The function body is empty without SSE_POSSIBLE.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 m0, m1, m2, m3;
	float *stop;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	stop = out4f + numitems*4;
	m0 = _mm_loadu_ps(inmatrix16f);
	m1 = _mm_loadu_ps(inmatrix16f + 4);
	m2 = _mm_loadu_ps(inmatrix16f + 8);
	m3 = _mm_loadu_ps(inmatrix16f + 12);
	if ((((size_t)in4f)&(ALIGN_SIZE-1)) == 0)
	{
		for (;out4f < stop;out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_load_ps(in4f);
			const __m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0);
			const __m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1);
			const __m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2);
			const __m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
	else
	{
		for (;out4f < stop;out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_loadu_ps(in4f);
			const __m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0);
			const __m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1);
			const __m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2);
			const __m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
#endif
}
1725
// Copies numitems four-float vectors from in4f to out4f with memcpy, so the
// two ranges must not overlap.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1730
1731 #ifdef SSE_POSSIBLE
// Perspective-divides and viewport-maps one vertex.
// in is (x, y, z, w). The vertex is rearranged to (1, x, y, z), multiplied by
// viewportscale, divided by w, offset by viewportcenter and rotated back, so
// out lanes 0-2 hold the mapped x, y, z and lane 3 holds
// viewportcenter[0] + viewportscale[0]/w.
// viewportcenter and viewportscale are therefore laid out as (w, x, y, z).
// The expansion is a brace block; in is evaluated once and out is assigned once.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 dpsr_clip = (in); \
	__m128 dpsr_w = _mm_shuffle_ps(dpsr_clip, dpsr_clip, _MM_SHUFFLE(3, 3, 3, 3)); \
	__m128 dpsr_1xyz = _mm_shuffle_ps(dpsr_clip, dpsr_clip, _MM_SHUFFLE(2, 1, 0, 3)); \
	dpsr_1xyz = _mm_move_ss(dpsr_1xyz, _mm_set_ss(1.0f)); \
	dpsr_1xyz = _mm_div_ps(_mm_mul_ps(viewportscale, dpsr_1xyz), dpsr_w); \
	dpsr_1xyz = _mm_add_ps(viewportcenter, dpsr_1xyz); \
	out = _mm_shuffle_ps(dpsr_1xyz, dpsr_1xyz, _MM_SHUFFLE(0, 3, 2, 1)); \
}

// Expands to exactly the same code as DPSOFTRAST_PROJECTVERTEX.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
	DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale)
1747
// Transforms one vertex: out = x*m0 + (y*m1 + (z*m2 + w*m3)), where in is
// (x, y, z, w) and m0..m3 are __m128 values.
// The expansion is a brace block; in is evaluated once and out is assigned once.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 dpsr_src = (in); \
	__m128 dpsr_sum = _mm_mul_ps(_mm_shuffle_ps(dpsr_src, dpsr_src, _MM_SHUFFLE(3, 3, 3, 3)), m3); \
	dpsr_sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(dpsr_src, dpsr_src, _MM_SHUFFLE(2, 2, 2, 2)), m2), dpsr_sum); \
	dpsr_sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(dpsr_src, dpsr_src, _MM_SHUFFLE(1, 1, 1, 1)), m1), dpsr_sum); \
	out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(dpsr_src, dpsr_src, _MM_SHUFFLE(0, 0, 0, 0)), m0), dpsr_sum); \
}
1756
1757 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1758 {
1759         int clipmask = 0xFF;
1760         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1761         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1762         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1763         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1764         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1765         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1766         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1767         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1768         #define BBFRONT(k, pos) \
1769         { \
1770                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1771                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1772                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1773                 { \
1774                         __m128 proj; \
1775                         clipmask &= ~(1<<k); \
1776                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1777                         minproj = _mm_min_ss(minproj, proj); \
1778                         maxproj = _mm_max_ss(maxproj, proj); \
1779                 } \
1780         }
1781         BBFRONT(0, minpos); 
1782         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1783         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1784         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1785         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1786         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1787         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1788         BBFRONT(7, maxpos);
1789         #define BBCLIP(k) \
1790         { \
1791                 if (clipmask&(1<<k)) \
1792                 { \
1793                         if (!(clipmask&(1<<(k^1)))) \
1794                         { \
1795                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1796                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1797                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1798                                 minproj = _mm_min_ss(minproj, proj); \
1799                                 maxproj = _mm_max_ss(maxproj, proj); \
1800                         } \
1801                         if (!(clipmask&(1<<(k^2)))) \
1802                         { \
1803                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1804                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1805                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1806                                 minproj = _mm_min_ss(minproj, proj); \
1807                                 maxproj = _mm_max_ss(maxproj, proj); \
1808                         } \
1809                         if (!(clipmask&(1<<(k^4)))) \
1810                         { \
1811                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1812                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1813                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1814                                 minproj = _mm_min_ss(minproj, proj); \
1815                                 maxproj = _mm_max_ss(maxproj, proj); \
1816                         } \
1817                 } \
1818         }
1819         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1820         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1821         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1822         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1823         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1824         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1825         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1826         *starty = _mm_cvttss_si32(maxproj);
1827         *endy = _mm_cvttss_si32(minproj)+1;
1828         return clipmask;
1829 }
1830         
1831 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1832 {
1833         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1834         float *end = out4f + numitems*4;
1835         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1836         __m128 minpos, maxpos;
1837         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1838         {
1839                 minpos = maxpos = _mm_loadu_ps(in4f);
1840                 while (out4f < end)
1841                 {
1842                         __m128 v = _mm_loadu_ps(in4f);
1843                         minpos = _mm_min_ps(minpos, v);
1844                         maxpos = _mm_max_ps(maxpos, v);
1845                         _mm_store_ps(out4f, v);
1846                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1847                         _mm_store_ps(screen4f, v);
1848                         in4f += 4;
1849                         out4f += 4;
1850                         screen4f += 4;
1851                 }
1852         }
1853         else
1854         {
1855                 minpos = maxpos = _mm_load_ps(in4f);
1856                 while (out4f < end)
1857                 {
1858                         __m128 v = _mm_load_ps(in4f);
1859                         minpos = _mm_min_ps(minpos, v);
1860                         maxpos = _mm_max_ps(maxpos, v);
1861                         _mm_store_ps(out4f, v);
1862                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1863                         _mm_store_ps(screen4f, v);
1864                         in4f += 4;
1865                         out4f += 4;
1866                         screen4f += 4;
1867                 }
1868         }
1869         if (starty && endy) 
1870         {
1871                 ALIGN(float minposf[4]);
1872                 ALIGN(float maxposf[4]);
1873                 _mm_store_ps(minposf, minpos);
1874                 _mm_store_ps(maxposf, maxpos);
1875                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1876         }
1877         return 0;
1878 }
1879
1880 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1881 {
1882         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1883         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1884         float *end;
1885         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1886                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1887         end = out4f + numitems*4;
1888         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1889         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1890         m0 = _mm_loadu_ps(inmatrix16f);
1891         m1 = _mm_loadu_ps(inmatrix16f + 4);
1892         m2 = _mm_loadu_ps(inmatrix16f + 8);
1893         m3 = _mm_loadu_ps(inmatrix16f + 12);
1894         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1895         {
1896                 minpos = maxpos = _mm_loadu_ps(in4f);
1897                 while (out4f < end)
1898                 {
1899                         __m128 v = _mm_loadu_ps(in4f);
1900                         minpos = _mm_min_ps(minpos, v);
1901                         maxpos = _mm_max_ps(maxpos, v);
1902                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1903                         _mm_store_ps(out4f, v);
1904                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1905                         _mm_store_ps(screen4f, v);
1906                         in4f += 4;
1907                         out4f += 4;
1908                         screen4f += 4;
1909                 }
1910         }
1911         else
1912         {
1913                 minpos = maxpos = _mm_load_ps(in4f);
1914                 while (out4f < end)
1915                 {
1916                         __m128 v = _mm_load_ps(in4f);
1917                         minpos = _mm_min_ps(minpos, v);
1918                         maxpos = _mm_max_ps(maxpos, v);
1919                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1920                         _mm_store_ps(out4f, v);
1921                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1922                         _mm_store_ps(screen4f, v);
1923                         in4f += 4;
1924                         out4f += 4;
1925                         screen4f += 4;
1926                 }
1927         }
1928         if (starty && endy) 
1929         {
1930                 ALIGN(float minposf[4]);
1931                 ALIGN(float maxposf[4]);
1932                 _mm_store_ps(minposf, minpos);
1933                 _mm_store_ps(maxposf, maxpos);
1934                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1935         }
1936         return 0;
1937 }
1938 #endif
1939
// Expands the vertex attribute selected by inarray, for vertices
// [firstvertex, firstvertex + numvertices), into the four-float scratch array
// dpsoftrast.post_array4f[outarray] and returns that array.
// Positions are read as three floats. Colors come from pointer_color4f, else
// pointer_color4ub, else the constant dpsoftrast.color. Any other inarray value
// is treated as a texcoord array with 2, 3 or 4 float components; when its
// pointer is unset or the component count is different, the scratch array is
// returned unmodified.
// Returns NULL without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	const unsigned char *inb;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
				break;
			default:
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1998
1999 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
2000 {
2001         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2002         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
2003         return data;
2004 }
2005
#if 0
// Currently compiled out.
// Projects a vertex array in place, writing screen coordinates to
// dpsoftrast.screencoord4f and the clip result / row range to
// dpsoftrast.drawclipped, drawstarty and drawendy. Returns the array, or NULL
// without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
2018
// Transforms a vertex array in place by inmatrix16f and projects it, writing
// screen coordinates to dpsoftrast.screencoord4f and the clip result / row
// range to dpsoftrast.drawclipped, drawstarty and drawendy.
// With inarray >= 0 the array is first loaded through DPSOFTRAST_Array_Load;
// otherwise the current contents of dpsoftrast.post_array4f[outarray] are used.
// Returns the array, or NULL without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2029
2030 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2031 {
2032         int x;
2033         int startx = span->startx;
2034         int endx = span->endx;
2035         float wslope = triangle->w[0];
2036         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2037         float endz = 1.0f / (w + wslope * startx);
2038         if (triangle->w[0] == 0)
2039         {
2040                 // LordHavoc: fast flat polygons (HUD/menu)
2041                 for (x = startx;x < endx;x++)
2042                         zf[x] = endz;
2043                 return;
2044         }
2045         for (x = startx;x < endx;)
2046         {
2047                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2048                 float z = endz, dz;
2049                 if (nextsub >= endx) nextsub = endsub = endx-1;
2050                 endz = 1.0f / (w + wslope * nextsub);
2051                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2052                 for (; x <= endsub; x++, z += dz)
2053                         zf[x] = z;
2054         }
2055 }
2056
// Writes the finished colors of one span into color buffer 0
// (dpsoftrast.fb_colorpixels[0]), applying alpha test, the span's pixel mask
// and the thread's framebuffer blend mode.
//
// thread   - supplies the alphatest flag and fb_blendmode
// triangle - not referenced by this function
// span     - supplies x/y (framebuffer position of pixel index 0), the
//            startx/endx pixel range and the pixelmask array
// in4ub    - source colors, four bytes per pixel, alpha in byte 3; indexed by
//            the same x as pixelmask
//
// Side effects beyond the framebuffer write: clears entries of span->pixelmask
// and writes two sentinel bytes at pixelmask[endx] and pixelmask[endx+1], so
// the mask array must have room for them.
// Does nothing when there is no color buffer or when built without SSE_POSSIBLE.
2057 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2058 {
2059 #ifdef SSE_POSSIBLE
2060         int x;
2061         int startx = span->startx;
2062         int endx = span->endx;
2063         int maskx;
2064         int subx;
2065         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2066         unsigned char * RESTRICT pixelmask = span->pixelmask;
2067         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2068         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2069         if (!pixel)
2070                 return;
             // move both framebuffer pointers to the span origin so they can be
             // indexed with the same x as in4ub and pixelmask (only pixeli is
             // used for the actual writes below)
2071         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2072         pixeli += span->y * dpsoftrast.fb_width + span->x;
2073         // handle alphatest now (this affects depth writes too)
2074         if (thread->alphatest)
2075                 for (x = startx;x < endx;x++)
2076                         if (in4ub[x*4+3] < 128)
2077                                 pixelmask[x] = false;
2078         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2079         // helps sprites, text and hud artwork
2080         switch(thread->fb_blendmode)
2081         {
2082         case DPSOFTRAST_BLENDMODE_ALPHA:
2083         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2084         case DPSOFTRAST_BLENDMODE_SUBALPHA:
                     // Narrow [startx, endx) to the range between the first and
                     // last pixel with non-zero alpha, and mask off zero-alpha
                     // pixels in between.  If no pixel has alpha, maskx stays at
                     // startx and the span becomes empty.
2085                 maskx = startx;
2086                 for (x = startx;x < endx;x++)
2087                 {
2088                         if (in4ub[x*4+3] >= 1)
2089                         {
2090                                 startx = x;
2091                                 for (;;)
2092                                 {
                                             // skip the run of pixels that have alpha;
                                             // maskx ends up one past the run
2093                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2094                                         maskx = x;
2095                                         if (x >= endx) break;
                                             // NOTE(review): the two increments below mean the
                                             // first zero-alpha pixel and the pixel after it are
                                             // never masked here; clearing starts two pixels
                                             // into the gap - confirm this is intentional
2096                                         ++x;
2097                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2098                                         if (x >= endx) break;
2099                                 }
2100                                 break;
2101                         }
2102                 }
2103                 endx = maskx;
2104                 break;
2105         case DPSOFTRAST_BLENDMODE_OPAQUE:
2106         case DPSOFTRAST_BLENDMODE_ADD:
2107         case DPSOFTRAST_BLENDMODE_INVMOD:
2108         case DPSOFTRAST_BLENDMODE_MUL:
2109         case DPSOFTRAST_BLENDMODE_MUL2:
2110         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2111         case DPSOFTRAST_BLENDMODE_INVADD:
2112                 break;
2113         }
2114         // put some special values at the end of the mask to ensure the loops end
             // (the 1 stops the "skip masked" scans, the 0 after it stops the
             // "measure unmasked run" scans)
2115         pixelmask[endx] = 1;
2116         pixelmask[endx+1] = 0;
2117         // LordHavoc: use a double loop to identify subspans, this helps the
2118         // optimized copy/blend loops to perform at their best, most triangles
2119         // have only one run of pixels, and do the search using wide reads...
2120         x = startx;
2121         while (x < endx)
2122         {
2123                 // if this pixel is masked off, it's probably not alone...
2124                 if (!pixelmask[x])
2125                 {
2126                         x++;
2127 #if 1
2128                         if (x + 8 < endx)
2129                         {
2130                                 // the 4-item search must be aligned or else it stalls badly
                                     // (step one byte at a time until x is a multiple of 4,
                                     // leaving early if an unmasked pixel turns up)
2131                                 if ((x & 3) && !pixelmask[x]) 
2132                                 {
2133                                         if(pixelmask[x]) goto endmasked;
2134                                         x++;
2135                                         if (x & 3)
2136                                         {
2137                                                 if(pixelmask[x]) goto endmasked;
2138                                                 x++;
2139                                                 if (x & 3)
2140                                                 {
2141                                                         if(pixelmask[x]) goto endmasked;
2142                                                         x++;
2143                                                 }
2144                                         }
2145                                 }
                                     // skip four masked pixels per iteration; the sentinel
                                     // byte at pixelmask[endx] makes the read non-zero
2146                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2147                                         x += 4;
2148                         }
2149 #endif
                             // finish the search one byte at a time
2150                         for (;!pixelmask[x];x++)
2151                                 ;
2152                         // rather than continue the loop, just check the end variable
2153                         if (x >= endx)
2154                                 break;
2155                 }
2156         endmasked:
2157                 // find length of subspan
                     // (x is the first unmasked pixel; subx becomes one past the
                     // last unmasked pixel of this run)
2158                 subx = x + 1;
2159 #if 1
2160                 if (subx + 8 < endx)
2161                 {
2162                         if (subx & 3)
2163                         {
2164                                 if(!pixelmask[subx]) goto endunmasked;
2165                                 subx++;
2166                                 if (subx & 3)
2167                                 {
2168                                         if(!pixelmask[subx]) goto endunmasked;
2169                                         subx++;
2170                                         if (subx & 3)
2171                                         {
2172                                                 if(!pixelmask[subx]) goto endunmasked;
2173                                                 subx++;
2174                                         }
2175                                 }
2176                         }
                             // four unmasked pixels per iteration; this only matches mask
                             // bytes that are exactly 1, anything else is left to the
                             // byte loop below
2177                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2178                                 subx += 4;
2179                 }
2180 #endif
2181                 for (;pixelmask[subx];subx++)
2182                         ;
2183                 // the checks can overshoot, so make sure to clip it...
2184                 if (subx > endx)
2185                         subx = endx;
2186         endunmasked:
2187                 // now that we know the subspan length...  process!
                     // (every case below advances x up to subx)
2188                 switch(thread->fb_blendmode)
2189                 {
2190                 case DPSOFTRAST_BLENDMODE_OPAQUE:
                             // plain copy: 16 pixels at a time, then 4, then 2, then 1,
                             // using unaligned 128-bit loads/stores
2191 #if 0
2192                         if (subx - x >= 16)
2193                         {
2194                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2195                                 x = subx;
2196                         }
2197                         else
2198 #elif 1
2199                         while (x + 16 <= subx)
2200                         {
2201                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2202                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2203                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2204                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2205                                 x += 16;
2206                         }
2207 #endif
2208                         {
2209                                 while (x + 4 <= subx)
2210                                 {
2211                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2212                                         x += 4;
2213                                 }
2214                                 if (x + 2 <= subx)
2215                                 {
2216                                         pixeli[x] = ini[x];
2217                                         pixeli[x+1] = ini[x+1];
2218                                         x += 2;
2219                                 }
2220                                 if (x < subx)
2221                                 {
2222                                         pixeli[x] = ini[x];
2223                                         x++;
2224                                 }
2225                         }
2226                         break;
2227                 case DPSOFTRAST_BLENDMODE_ALPHA:
                     // FINISHBLEND(blend2, blend1) blends the subspan [x, subx):
                     // pixels are handled two at a time with blend2, then a trailing
                     // odd pixel with blend1.  In both, src (incoming color) and dst
                     // (framebuffer color) hold the 8-bit channels widened to 16-bit
                     // lanes; the blend code must leave its result in dst, which is
                     // packed back to bytes with unsigned saturation.
                     // The macro is defined here, inside the switch, and is reused by
                     // all the following cases.
2228                 #define FINISHBLEND(blend2, blend1) \
2229                         for (;x + 1 < subx;x += 2) \
2230                         { \
2231                                 __m128i src, dst; \
2232                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2233                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2234                                 blend2; \
2235                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2236                         } \
2237                         if (x < subx) \
2238                         { \
2239                                 __m128i src, dst; \
2240                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2241                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2242                                 blend1; \
2243                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2244                                 x++; \
2245                         }
                             // dst += (src - dst) * srcalpha / 256
                             // (blend = source alpha broadcast to every channel of its pixel)
2246                         FINISHBLEND({
2247                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2248                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2249                         }, {
2250                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2251                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2252                         });
2253                         break;
2254                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
                             // dst += (src * srcalpha) >> 8
2255                         FINISHBLEND({
2256                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2257                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2258                         }, {
2259                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2260                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2261                         });
2262                         break;
2263                 case DPSOFTRAST_BLENDMODE_ADD:
                             // dst += src
2264                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2265                         break;
2266                 case DPSOFTRAST_BLENDMODE_INVMOD:
                             // dst -= (dst * src) >> 8
2267                         FINISHBLEND({
2268                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2269                         }, {
2270                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2271                         });
2272                         break;
2273                 case DPSOFTRAST_BLENDMODE_MUL:
                             // dst = (src * dst) >> 8
2274                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2275                         break;
2276                 case DPSOFTRAST_BLENDMODE_MUL2:
                             // dst = (src * dst) >> 7, i.e. twice the MUL result
2277                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2278                         break;
2279                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
                             // dst -= (src * srcalpha) >> 8
2280                         FINISHBLEND({
2281                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2282                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2283                         }, {
2284                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2285                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2286                         });
2287                         break;
2288                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
                             // dst = src + dst - ((dst * srcalpha) >> 8)
2289                         FINISHBLEND({
2290                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2291                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2292                         }, {
2293                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2294                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2295                         });
2296                         break;
2297                 case DPSOFTRAST_BLENDMODE_INVADD:
                             // dst += (255 - dst) * src / 256
2298                         FINISHBLEND({
2299                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2300                         }, {
2301                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2302                         });
2303                         break;
2304                 }
2305         }
2306 #endif
2307 }
2308
2309 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2310         // warning: this is SLOW, only use if the optimized per-span functions won't do
2311 {
2312         const unsigned char * RESTRICT pixelbase;
2313         const unsigned char * RESTRICT pixel[4];
2314         int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2315         int wrapmask[2] = { width-1, height-1 };
2316         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2317         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2318         {
2319                 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2320                 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2321                 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2322                 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2323                 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2324                 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2325                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2326                 {
2327                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2328                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2329                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2330                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2331                 }
2332                 else
2333                 {
2334                         tci[0] &= wrapmask[0];
2335                         tci[1] &= wrapmask[1];
2336                         tci1[0] &= wrapmask[0];
2337                         tci1[1] &= wrapmask[1];
2338                 }
2339                 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2340                 pixel[1] = pixelbase + 4 * (tci[1]*width+tci1[0]);
2341                 pixel[2] = pixelbase + 4 * (tci1[1]*width+tci[0]);
2342                 pixel[3] = pixelbase + 4 * (tci1[1]*width+tci1[0]);
2343                 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2344                 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2345                 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2346                 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2347         }
2348         else
2349         {
2350                 int tci[2] = { x * width, y * height };
2351                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2352                 {
2353                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2354                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2355                 }
2356                 else
2357                 {
2358                         tci[0] &= wrapmask[0];
2359                         tci[1] &= wrapmask[1];
2360                 }
2361                 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2362                 c[0] = pixel[0][0];
2363                 c[1] = pixel[0][1];
2364                 c[2] = pixel[0][2];
2365                 c[3] = pixel[0][3];
2366         }
2367 }
2368
2369 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2370 {
2371         int x;
2372         int startx = span->startx;
2373         int endx = span->endx;
2374         int flags;
2375         float c[4];
2376         float data[4];
2377         float slope[4];
2378         float tc[2], endtc[2];
2379         float tcscale[2];
2380         unsigned int tci[2];
2381         unsigned int tci1[2];
2382         unsigned int tcimin[2];
2383         unsigned int tcimax[2];
2384         int tciwrapmask[2];
2385         int tciwidth;
2386         int filter;
2387         int mip;
2388         const unsigned char * RESTRICT pixelbase;
2389         const unsigned char * RESTRICT pixel[4];
2390         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2391         // if no texture is bound, just fill it with white
2392         if (!texture)
2393         {
2394                 for (x = startx;x < endx;x++)
2395                 {
2396                         out4f[x*4+0] = 1.0f;
2397                         out4f[x*4+1] = 1.0f;
2398                         out4f[x*4+2] = 1.0f;
2399                         out4f[x*4+3] = 1.0f;
2400                 }
2401                 return;
2402         }
2403         mip = triangle->mip[texunitindex];
2404         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2405         // if this mipmap of the texture is 1 pixel, just fill it with that color
2406         if (texture->mipmap[mip][1] == 4)
2407         {
2408                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2409                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2410                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2411                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2412                 for (x = startx;x < endx;x++)
2413                 {
2414                         out4f[x*4+0] = c[0];
2415                         out4f[x*4+1] = c[1];
2416                         out4f[x*4+2] = c[2];
2417                         out4f[x*4+3] = c[3];
2418                 }
2419                 return;
2420         }
2421         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2422         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2423         flags = texture->flags;
2424         tcscale[0] = texture->mipmap[mip][2];
2425         tcscale[1] = texture->mipmap[mip][3];
2426         tciwidth = texture->mipmap[mip][2];
2427         tcimin[0] = 0;
2428         tcimin[1] = 0;
2429         tcimax[0] = texture->mipmap[mip][2]-1;
2430         tcimax[1] = texture->mipmap[mip][3]-1;
2431         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2432         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2433         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2434         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2435         if (filter)
2436         {
2437                 endtc[0] -= 0.5f;
2438                 endtc[1] -= 0.5f;
2439         }
2440         for (x = startx;x < endx;)
2441         {
2442                 unsigned int subtc[2];
2443                 unsigned int substep[2];
2444                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2445                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2446                 if (nextsub >= endx)
2447                 {
2448                         nextsub = endsub = endx-1;      
2449                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2450                 }
2451                 tc[0] = endtc[0];
2452                 tc[1] = endtc[1];
2453                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2454                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2455                 if (filter)
2456                 {
2457                         endtc[0] -= 0.5f;
2458                         endtc[1] -= 0.5f;
2459                 }
2460                 substep[0] = (endtc[0] - tc[0]) * subscale;
2461                 substep[1] = (endtc[1] - tc[1]) * subscale;
2462                 subtc[0] = tc[0] * (1<<12);
2463                 subtc[1] = tc[1] * (1<<12);
2464                 if (filter)
2465                 {
2466                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2467                         {
2468                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2469                                 {
2470                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2471                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2472                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2473                                         tci[0] = subtc[0]>>12;
2474                                         tci[1] = subtc[1]>>12;
2475                                         tci1[0] = tci[0] + 1;
2476                                         tci1[1] = tci[1] + 1;
2477                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2478                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2479                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2480                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2481                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2482                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2483                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2484                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2485                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2486                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2487                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2488                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2489                                         out4f[x*4+0] = c[0];
2490                                         out4f[x*4+1] = c[1];
2491                                         out4f[x*4+2] = c[2];
2492                                         out4f[x*4+3] = c[3];
2493                                 }
2494                         }
2495                         else
2496                         {
2497                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2498                                 {
2499                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2500                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2501                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2502                                         tci[0] = subtc[0]>>12;
2503                                         tci[1] = subtc[1]>>12;
2504                                         tci1[0] = tci[0] + 1;
2505                                         tci1[1] = tci[1] + 1;
2506                                         tci[0] &= tciwrapmask[0];
2507                                         tci[1] &= tciwrapmask[1];
2508                                         tci1[0] &= tciwrapmask[0];
2509                                         tci1[1] &= tciwrapmask[1];
2510                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2511                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2512                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2513                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2514                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2515                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2516                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2517                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2518                                         out4f[x*4+0] = c[0];
2519                                         out4f[x*4+1] = c[1];
2520                                         out4f[x*4+2] = c[2];
2521                                         out4f[x*4+3] = c[3];
2522                                 }
2523                         }
2524                 }
2525                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2526                 {
2527                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2528                         {
2529                                 tci[0] = subtc[0]>>12;
2530                                 tci[1] = subtc[1]>>12;
2531                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2532                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2533                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2534                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2535                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2536                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2537                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2538                                 out4f[x*4+0] = c[0];
2539                                 out4f[x*4+1] = c[1];
2540                                 out4f[x*4+2] = c[2];
2541                                 out4f[x*4+3] = c[3];
2542                         }
2543                 }
2544                 else
2545                 {
2546                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2547                         {
2548                                 tci[0] = subtc[0]>>12;
2549                                 tci[1] = subtc[1]>>12;
2550                                 tci[0] &= tciwrapmask[0];
2551                                 tci[1] &= tciwrapmask[1];
2552                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2553                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2554                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2555                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2556                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2557                                 out4f[x*4+0] = c[0];
2558                                 out4f[x*4+1] = c[1];
2559                                 out4f[x*4+2] = c[2];
2560                                 out4f[x*4+3] = c[3];
2561                         }
2562                 }
2563         }
2564 }
2565
2566 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2567 {
2568 #ifdef SSE_POSSIBLE
2569         int x;
2570         int startx = span->startx;
2571         int endx = span->endx;
2572         int flags;
2573         __m128 data, slope, tcscale;
2574         __m128i tcsize, tcmask, tcoffset, tcmax;
2575         __m128 tc, endtc;
2576         __m128i subtc, substep, endsubtc;
2577         int filter;
2578         int mip;
2579         int affine; // LordHavoc: optimized affine texturing case
2580         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2581         const unsigned char * RESTRICT pixelbase;
2582         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2583         // if no texture is bound, just fill it with white
2584         if (!texture)
2585         {
2586                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2587                 return;
2588         }
2589         mip = triangle->mip[texunitindex];
2590         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2591         // if this mipmap of the texture is 1 pixel, just fill it with that color
2592         if (texture->mipmap[mip][1] == 4)
2593         {
2594                 unsigned int k = *((const unsigned int *)pixelbase);
2595                 for (x = startx;x < endx;x++)
2596                         outi[x] = k;
2597                 return;
2598         }
2599         affine = zf[startx] == zf[endx-1];
2600         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2601         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2602         flags = texture->flags;
2603         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2604         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2605         tcscale = _mm_cvtepi32_ps(tcsize);
2606         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2607         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2608         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2609         if (filter)
2610                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2611         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2612         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2613         tcmax = _mm_packs_epi32(tcmask, tcmask);
2614         for (x = startx;x < endx;)
2615         {
2616                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2617                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2618                 if (nextsub >= endx || affine)
2619                 {
2620                         nextsub = endsub = endx-1;
2621                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2622                 }       
2623                 tc = endtc;
2624                 subtc = endsubtc;
2625                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2626                 if (filter)
2627                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2628                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2629                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2630                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2631                 substep = _mm_slli_epi32(substep, 1);
2632                 if (filter)
2633                 {
2634                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2635                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2636                         {
2637                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2638                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2639                                 {
2640                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2641                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2642                                         tci = _mm_madd_epi16(tci, tcoffset);
2643                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2644                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2645                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2646                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2647                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2648                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2649                                         fracm = _mm_srli_epi16(subtc, 1);
2650                                         pix1 = _mm_add_epi16(pix1,
2651                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2652                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2653                                         pix3 = _mm_add_epi16(pix3,
2654                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2655                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2656                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2657                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2658                                         pix2 = _mm_add_epi16(pix2,
2659                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2660                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2661                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2662                                 }
2663                                 if (x <= endsub)
2664                                 {
2665                                         const unsigned char * RESTRICT ptr1;
2666                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2667                                         tci = _mm_madd_epi16(tci, tcoffset);
2668                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2669                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2670                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2671                                         fracm = _mm_srli_epi16(subtc, 1);
2672                                         pix1 = _mm_add_epi16(pix1,
2673                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2674                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2675                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2676                                         pix1 = _mm_add_epi16(pix1,
2677                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2678                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2679                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2680                                         x++;
2681                                 }
2682                         }
2683                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2684                         {
2685                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2686                                 {
2687                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2688                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2689                                         tci = _mm_madd_epi16(tci, tcoffset);
2690                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2691                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2692                                                                                         _mm_setzero_si128());
2693                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2694                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2695                                                                                         _mm_setzero_si128());
2696                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2697                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2698                                         tci = _mm_madd_epi16(tci, tcoffset);
2699                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2700                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2701                                                                                         _mm_setzero_si128());
2702                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2703                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2704                                                                                         _mm_setzero_si128());
2705                                         fracm = _mm_srli_epi16(subtc, 1);
2706                                         pix1 = _mm_add_epi16(pix1,
2707                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2708                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2709                                         pix3 = _mm_add_epi16(pix3,
2710                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2711                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2712                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2713                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2714                                         pix2 = _mm_add_epi16(pix2,
2715                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2716                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2717                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2718                                 }
2719                                 if (x <= endsub)
2720                                 {
2721                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2722                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2723                                         tci = _mm_madd_epi16(tci, tcoffset);
2724                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2725                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2726                                                                                         _mm_setzero_si128());
2727                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2728                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2729                                                                                         _mm_setzero_si128());
2730                                         fracm = _mm_srli_epi16(subtc, 1);
2731                                         pix1 = _mm_add_epi16(pix1,
2732                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2733                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2734                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2735                                         pix1 = _mm_add_epi16(pix1,
2736                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2737                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2738                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2739                                         x++;
2740                                 }
2741                         }
2742                         else
2743                         {
2744                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2745                                 {
2746                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2747                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2748                                         tci = _mm_madd_epi16(tci, tcoffset);
2749                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2750                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2751                                                                                         _mm_setzero_si128());
2752                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2753                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2754                                                                                         _mm_setzero_si128());
2755                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2756                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2757                                         tci = _mm_madd_epi16(tci, tcoffset);
2758                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2759                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2760                                                                                         _mm_setzero_si128());
2761                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2762                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2763                                                                                         _mm_setzero_si128());
2764                                         fracm = _mm_srli_epi16(subtc, 1);
2765                                         pix1 = _mm_add_epi16(pix1,
2766                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2767                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2768                                         pix3 = _mm_add_epi16(pix3,
2769                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2770                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2771                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2772                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2773                                         pix2 = _mm_add_epi16(pix2,
2774                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2775                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2776                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2777                                 }
2778                                 if (x <= endsub)
2779                                 {
2780                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2781                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2782                                         tci = _mm_madd_epi16(tci, tcoffset);
2783                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2784                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2785                                                                                         _mm_setzero_si128());
2786                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2787                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2788                                                                                         _mm_setzero_si128());
2789                                         fracm = _mm_srli_epi16(subtc, 1);
2790                                         pix1 = _mm_add_epi16(pix1,
2791                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2792                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2793                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2794                                         pix1 = _mm_add_epi16(pix1,
2795                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2796                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2797                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2798                                         x++;
2799                                 }
2800                         }
2801                 }
2802                 else
2803                 {
2804                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2805                         {
2806                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2807                                 {
2808                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2809                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2810                                         tci = _mm_madd_epi16(tci, tcoffset);
2811                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2812                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2813                                 }
2814                                 if (x <= endsub)
2815                                 {
2816                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2817                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2818                                         tci = _mm_madd_epi16(tci, tcoffset);
2819                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2820                                         x++;
2821                                 }
2822                         }
2823                         else
2824                         {
2825                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2826                                 {
2827                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2828                                         tci = _mm_and_si128(tci, tcmax); 
2829                                         tci = _mm_madd_epi16(tci, tcoffset);
2830                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2831                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2832                                 }
2833                                 if (x <= endsub)
2834                                 {
2835                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2836                                         tci = _mm_and_si128(tci, tcmax); 
2837                                         tci = _mm_madd_epi16(tci, tcoffset);
2838                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2839                                         x++;
2840                                 }
2841                         }
2842                 }
2843         }
2844 #endif
2845 }
2846
2847 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2848 {
2849         // TODO: IMPLEMENT
2850         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2851 }
2852
// Shadowmap sampling stub: the lookup is not implemented, so the input vector
// is ignored and a constant 1.0f is returned for every call.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2858
2859 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2860 {
2861         int x;
2862         int startx = span->startx;
2863         int endx = span->endx;
2864         float c[4];
2865         float data[4];
2866         float slope[4];
2867         float z;
2868         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2869         for (x = startx;x < endx;x++)
2870         {
2871                 z = zf[x];
2872                 c[0] = (data[0] + slope[0]*x) * z;
2873                 c[1] = (data[1] + slope[1]*x) * z;
2874                 c[2] = (data[2] + slope[2]*x) * z;
2875                 c[3] = (data[3] + slope[3]*x) * z;
2876                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2877                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2878                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2879                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2880         }
2881 }
2882
2883 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2884 {
2885         int x;
2886         int startx = span->startx;
2887         int endx = span->endx;
2888         float c[4];
2889         float data[4];
2890         float slope[4];
2891         float z;
2892         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2893         for (x = startx;x < endx;x++)
2894         {
2895                 z = zf[x];
2896                 c[0] = (data[0] + slope[0]*x) * z;
2897                 c[1] = (data[1] + slope[1]*x) * z;
2898                 c[2] = (data[2] + slope[2]*x) * z;
2899                 c[3] = (data[3] + slope[3]*x) * z;
2900                 out4f[x*4+0] = c[0];
2901                 out4f[x*4+1] = c[1];
2902                 out4f[x*4+2] = c[2];
2903                 out4f[x*4+3] = c[3];
2904         }
2905 }
2906
2907 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2908 {
2909         int x, startx = span->startx, endx = span->endx;
2910         float c[4], localcolor[4];
2911         localcolor[0] = subcolor[0];
2912         localcolor[1] = subcolor[1];
2913         localcolor[2] = subcolor[2];
2914         localcolor[3] = subcolor[3];
2915         for (x = startx;x < endx;x++)
2916         {
2917                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2918                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2919                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2920                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2921                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2922                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2923                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2924                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2925         }
2926 }
2927
2928 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2929 {
2930         int x, startx = span->startx, endx = span->endx;
2931         for (x = startx;x < endx;x++)
2932         {
2933                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2934                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2935                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2936                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2937         }
2938 }
2939
2940 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2941 {
2942         int x, startx = span->startx, endx = span->endx;
2943         for (x = startx;x < endx;x++)
2944         {
2945                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2946                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2947                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2948                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2949         }
2950 }
2951
2952 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2953 {
2954         int x, startx = span->startx, endx = span->endx;
2955         float a, b;
2956         for (x = startx;x < endx;x++)
2957         {
2958                 a = 1.0f - inb4f[x*4+3];
2959                 b = inb4f[x*4+3];
2960                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2961                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2962                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2963                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2964         }
2965 }
2966
2967 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2968 {
2969         int x, startx = span->startx, endx = span->endx;
2970         float localcolor[4], ilerp, lerp;
2971         localcolor[0] = color[0];
2972         localcolor[1] = color[1];
2973         localcolor[2] = color[2];
2974         localcolor[3] = color[3];
2975         ilerp = 1.0f - localcolor[3];
2976         lerp = localcolor[3];
2977         for (x = startx;x < endx;x++)
2978         {
2979                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2980                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2981                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2982                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2983         }
2984 }
2985
2986
2987
2988 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2989 {
2990 #ifdef SSE_POSSIBLE
2991         int x;
2992         int startx = span->startx;
2993         int endx = span->endx;
2994         __m128 data, slope;
2995         __m128 mod, endmod;
2996         __m128i submod, substep, endsubmod;
2997         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2998         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2999         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3000         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3001         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3002         for (x = startx; x < endx;)
3003         {
3004                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3005                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3006                 if (nextsub >= endx)
3007                 {
3008                         nextsub = endsub = endx-1;
3009                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3010                 }
3011                 mod = endmod;
3012                 submod = endsubmod;
3013                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3014                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3015                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3016                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3017                 substep = _mm_packs_epi32(substep, substep);
3018                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3019                 {
3020                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3021                         pix = _mm_mulhi_epu16(pix, submod);
3022                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3023                 }
3024                 if (x <= endsub)
3025                 {
3026                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3027                         pix = _mm_mulhi_epu16(pix, submod);
3028                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3029                         x++;
3030                 }
3031         }
3032 #endif
3033 }
3034
3035 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3036 {
3037 #ifdef SSE_POSSIBLE
3038         int x;
3039         int startx = span->startx;
3040         int endx = span->endx;
3041         __m128 data, slope;
3042         __m128 mod, endmod;
3043         __m128i submod, substep, endsubmod;
3044         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3045         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3046         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3047         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3048         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3049         for (x = startx; x < endx;)
3050         {
3051                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3052                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3053                 if (nextsub >= endx)
3054                 {
3055                         nextsub = endsub = endx-1;
3056                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3057                 }
3058                 mod = endmod;
3059                 submod = endsubmod;
3060                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3061                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3062                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3063                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3064                 substep = _mm_packs_epi32(substep, substep);
3065                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3066                 {
3067                         __m128i pix = _mm_srai_epi16(submod, 4);
3068                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3069                 }
3070                 if (x <= endsub)
3071                 {
3072                         __m128i pix = _mm_srai_epi16(submod, 4);
3073                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3074                         x++;
3075                 }
3076         }
3077 #endif
3078 }
3079
3080 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3081 {
3082 #ifdef SSE_POSSIBLE
3083         int x, startx = span->startx, endx = span->endx;
3084         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3085         localcolor = _mm_packs_epi32(localcolor, localcolor);
3086         for (x = startx;x+2 <= endx;x+=2)
3087         {
3088                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3089                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3090                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3091                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3092         }
3093         if (x < endx)
3094         {
3095                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3096                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3097                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3098                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3099         }
3100 #endif
3101 }
3102
3103 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3104 {
3105 #ifdef SSE_POSSIBLE
3106         int x, startx = span->startx, endx = span->endx;
3107         for (x = startx;x+2 <= endx;x+=2)
3108         {
3109                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3110                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3111                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3112                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3113         }
3114         if (x < endx)
3115         {
3116                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3117                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3118                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3119                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3120         }
3121 #endif
3122 }
3123
3124 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3125 {
3126 #ifdef SSE_POSSIBLE
3127         int x, startx = span->startx, endx = span->endx;
3128         for (x = startx;x+2 <= endx;x+=2)
3129         {
3130                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3131                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3132                 pix1 = _mm_add_epi16(pix1, pix2);
3133                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3134         }
3135         if (x < endx)
3136         {
3137                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3138                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3139                 pix1 = _mm_add_epi16(pix1, pix2);
3140                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3141         }
3142 #endif
3143 }
3144
3145 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3146 {
3147 #ifdef SSE_POSSIBLE
3148         int x, startx = span->startx, endx = span->endx;
3149         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3150         tint = _mm_packs_epi32(tint, tint);
3151         for (x = startx;x+2 <= endx;x+=2)
3152         {
3153                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3154                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3155                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3156                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3157         }
3158         if (x < endx)
3159         {
3160                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3161                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3162                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3163                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3164         }
3165 #endif
3166 }
3167
3168 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3169 {
3170 #ifdef SSE_POSSIBLE
3171         int x, startx = span->startx, endx = span->endx;
3172         for (x = startx;x+2 <= endx;x+=2)
3173         {
3174                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3175                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3176                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3177                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3178                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3179         }
3180         if (x < endx)
3181         {
3182                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3183                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3184                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3185                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3186                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3187         }
3188 #endif
3189 }
3190
3191 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3192 {
3193 #ifdef SSE_POSSIBLE
3194         int x, startx = span->startx, endx = span->endx;
3195         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3196         localcolor = _mm_packs_epi32(localcolor, localcolor);
3197         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3198         for (x = startx;x+2 <= endx;x+=2)
3199         {
3200                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3201                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3202                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3203         }
3204         if (x < endx)
3205         {
3206                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3207                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3208                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3209         }
3210 #endif
3211 }
3212
3213
3214
3215 void DPSOFTRAST_VertexShader_Generic(void)
3216 {
3217         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3218         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3219         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3220         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3221                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3222 }
3223
3224 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3225 {
3226         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3227         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3228         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3229         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3230         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3231         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3232         {
3233                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3234                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3235                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3236                 {
3237                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3238                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3239                         {
3240                                 // multiply
3241                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3242                         }
3243                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3244                         {
3245                                 // add
3246                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3247                         }
3248                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3249                         {
3250                                 // alphablend
3251                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3252                         }
3253                 }
3254         }
3255         else
3256                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3257         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3258 }
3259
3260
3261
3262 void DPSOFTRAST_VertexShader_PostProcess(void)
3263 {
3264         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3265         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3266         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3267 }
3268
3269 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3270 {
3271         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3272         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3273         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3274         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3275         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3276         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3277         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3278         {
3279                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3280                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3281         }
3282         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3283         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3284         {
3285                 // TODO: implement saturation
3286         }
3287         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3288         {
3289                 // TODO: implement gammaramps
3290         }
3291         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3292 }
3293
3294
3295
3296 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3297 {
3298         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3299 }
3300
3301 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3302 {
3303         // this is never called (because colormask is off when this shader is used)
3304         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3305         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3306         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3307         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3308         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3309 }
3310
3311
3312
3313 void DPSOFTRAST_VertexShader_FlatColor(void)
3314 {
3315         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3316         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3317 }
3318
3319 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3320 {
3321 #ifdef SSE_POSSIBLE
3322         unsigned char * RESTRICT pixelmask = span->pixelmask;
3323         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3324         int x, startx = span->startx, endx = span->endx;
3325         __m128i Color_Ambientm;
3326         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3327         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3328         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3329         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3330         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3331         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3332                 pixel = buffer_FragColorbgra8;
3333         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3334         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3335         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3336         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3337         for (x = startx;x < endx;x++)
3338         {
3339                 __m128i color, pix;
3340                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3341                 {
3342                         __m128i pix2;
3343                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3344                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3345                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3346                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3347                         x += 3;
3348                         continue;
3349                 }
3350                 if (!pixelmask[x])
3351                         continue;
3352                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3353                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3354                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3355         }
3356         if (pixel == buffer_FragColorbgra8)
3357                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3358 #endif
3359 }
3360
3361
3362
3363 void DPSOFTRAST_VertexShader_VertexColor(void)
3364 {
3365         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3366         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3367         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3368 }
3369
3370 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3371 {
3372 #ifdef SSE_POSSIBLE
3373         unsigned char * RESTRICT pixelmask = span->pixelmask;
3374         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3375         int x, startx = span->startx, endx = span->endx;
3376         __m128i Color_Ambientm, Color_Diffusem;
3377         __m128 data, slope;
3378         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3379         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3380         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3381         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3382         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3383         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3384         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3385                 pixel = buffer_FragColorbgra8;
3386         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3387         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3388         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3389         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3390         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3391         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3392         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3393         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3394         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3395         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3396         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3397         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3398         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3399         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3400         {
3401                 __m128i color, mod, pix;
3402                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3403                 {
3404                         __m128i pix2, mod2;
3405                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3406                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3407                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3408                         data = _mm_add_ps(data, slope);
3409                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3410                         data = _mm_add_ps(data, slope);
3411                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3412                         data = _mm_add_ps(data, slope);
3413                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3414                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3415                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3416                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3417                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3418                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3419                         x += 3;
3420                         continue;
3421                 }
3422                 if (!pixelmask[x])
3423                         continue;
3424                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3425                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3426                 mod = _mm_packs_epi32(mod, mod);
3427                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3428                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3429         }
3430         if (pixel == buffer_FragColorbgra8)
3431                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3432 #endif
3433 }
3434
3435
3436
3437 void DPSOFTRAST_VertexShader_Lightmap(void)
3438 {
3439         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3440         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3441         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3442 }
3443
3444 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3445 {
3446 #ifdef SSE_POSSIBLE
3447         unsigned char * RESTRICT pixelmask = span->pixelmask;
3448         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3449         int x, startx = span->startx, endx = span->endx;
3450         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3451         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3452         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3453         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3454         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3455         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3456         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3457         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3458         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3459         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3460                 pixel = buffer_FragColorbgra8;
3461         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3462         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3463         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3464         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3465         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3466         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3467         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3468         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3469         {
3470                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3471                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3472                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3473                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3474                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3475                 for (x = startx;x < endx;x++)
3476                 {
3477                         __m128i color, lightmap, glow, pix;
3478                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3479                         {
3480                                 __m128i pix2;
3481                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3482                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3483                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3484                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3485                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3486                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3487                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3488                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3489                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3490                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3491                                 x += 3;
3492                                 continue;
3493                         }
3494                         if (!pixelmask[x])
3495                                 continue;
3496                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3497                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3498                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3499                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3500                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3501                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3502                 }
3503         }
3504         else
3505         {
3506                 for (x = startx;x < endx;x++)
3507                 {
3508                         __m128i color, lightmap, pix;
3509                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3510                         {
3511                                 __m128i pix2;
3512                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3513                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3514                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3515                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3516                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3517                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3518                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3519                                 x += 3;
3520                                 continue;
3521                         }
3522                         if (!pixelmask[x]) 
3523                                 continue;
3524                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3525                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3526                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3527                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3528                 }
3529         }
3530         if (pixel == buffer_FragColorbgra8)
3531                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3532 #endif
3533 }
3534
3535
3536 void DPSOFTRAST_VertexShader_LightDirection(void);
3537 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3538
// FakeLight vertex stage: no work of its own, it simply runs the shared
// LightDirection vertex shader.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3543
3544 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3545 {
3546         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3547 }
3548
3549
3550
3551 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3552 {
3553         DPSOFTRAST_VertexShader_LightDirection();
3554         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3555 }
3556
3557 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3558 {
3559         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3560 }
3561
3562
3563
3564 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3565 {
3566         DPSOFTRAST_VertexShader_LightDirection();
3567         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3568 }
3569
3570 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3571 {
3572         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3573 }
3574
3575
3576
3577 void DPSOFTRAST_VertexShader_LightDirection(void)
3578 {
3579         int i;
3580         int numvertices = dpsoftrast.numvertices;
3581         float LightDir[4];
3582         float LightVector[4];
3583         float EyePosition[4];
3584         float EyeVectorModelSpace[4];
3585         float EyeVector[4];
3586         float position[4];
3587         float svector[4];
3588         float tvector[4];
3589         float normal[4];
3590         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3591         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3592         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3593         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3594         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3595         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3596         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3597         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3598         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3599         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3600         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3601         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3602         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3603         for (i = 0;i < numvertices;i++)
3604         {
3605                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3606                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3607                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3608                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3609                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3610                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3611                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3612                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3613                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3614                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3615                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3616                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3617                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3618                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3619                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3620                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3621                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3622                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3623                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3624                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3625                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3626                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3627                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3628                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3629                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3630                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3631                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3632                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3633                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3634         }
3635         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3636 }
3637
3638 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3639 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3640 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3641 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3642 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3643 #define DPSOFTRAST_Vector3Normalize(v)\
3644 do\
3645 {\
3646         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3647         if (len)\
3648         {\
3649                 len = 1.0f / len;\
3650                 v[0] *= len;\
3651                 v[1] *= len;\
3652                 v[2] *= len;\
3653         }\
3654 }\
3655 while(0)
3656
3657 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3658 {
3659         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3660         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3661         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3662         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3663         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3664         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3665         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3666         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3667         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3668         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3669         int x, startx = span->startx, endx = span->endx;
3670         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3671         float LightVectordata[4];
3672         float LightVectorslope[4];
3673         float EyeVectordata[4];
3674         float EyeVectorslope[4];
3675         float VectorSdata[4];
3676         float VectorSslope[4];
3677         float VectorTdata[4];
3678         float VectorTslope[4];
3679         float VectorRdata[4];
3680         float VectorRslope[4];
3681         float z;
3682         float diffusetex[4];
3683         float glosstex[4];
3684         float surfacenormal[4];
3685         float lightnormal[4];
3686         float lightnormal_modelspace[4];
3687         float eyenormal[4];
3688         float specularnormal[4];
3689         float diffuse;
3690         float specular;
3691         float SpecularPower;
3692         int d[4];
3693         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3694         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3695         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3696         Color_Glow[3] = 0.0f;
3697         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3698         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3699         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3700         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3701         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3702         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3703         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3704         Color_Pants[3] = 0.0f;
3705         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3706         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3707         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3708         Color_Shirt[3] = 0.0f;
3709         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3710         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3711         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3712         {
3713                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3714                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3715         }
3716         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3717         {
3718                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3719         }
3720         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3721         {
3722                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3723                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3724                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3725                 Color_Diffuse[3] = 0.0f;
3726                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3727                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3728                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3729                 LightColor[3] = 0.0f;
3730                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3731                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3732                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3733                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3734                 Color_Specular[3] = 0.0f;
3735                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3736                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3737                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3738
3739                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3740                 {
3741                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3742                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3743                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3744                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3745                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3746                 }
3747                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3748                 {
3749                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3750                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3751                 }
3752                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3753                 {
3754                         // nothing of this needed
3755                 }
3756                 else
3757                 {
3758                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3759                 }
3760
3761                 for (x = startx;x < endx;x++)
3762                 {
3763                         z = buffer_z[x];
3764                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3765                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3766                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3767                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3768                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3769                         {
3770                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3771                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3772                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3773                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3774                         }
3775                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3776                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3777                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3778                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3779                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3780                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3781                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3782                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3783
3784                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3785                         {
3786                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3787                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3788                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3789                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3790
3791                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3792                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3793                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3794                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3795
3796                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3797                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3798                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3799                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3800
3801                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3802                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3803                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3804                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3805
3806                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3807                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3808
3809                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3810                                 {
3811                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3812                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3813                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3814                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3815                                 }
3816                         }
3817                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3818                         {
3819                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3820                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3821                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3822                                 {
3823                                         float f = 1.0f / 256.0f;
3824                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3825                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3826                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3827                                 }
3828                         }
3829                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3830                         {
3831                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3832                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3833                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3834                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3835
3836                                 LightColor[0] = 1.0;
3837                                 LightColor[1] = 1.0;
3838                                 LightColor[2] = 1.0;
3839                         }
3840                         else
3841                         {
3842                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3843                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3844                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3845                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3846                         }
3847
3848                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3849
3850                         if(thread->shader_exactspecularmath)
3851                         {
3852                                 // reflect lightnormal at surfacenormal, take the negative of that
3853                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3854                                 float f;
3855                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3856                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3857                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3858                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3859
3860                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3861                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3862                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3863                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3864                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3865
3866                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3867                         }
3868                         else
3869                         {
3870                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3871                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3872                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3873                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3874
3875                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3876                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3877                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3878                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3879
3880                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3881                         }
3882
3883                         specular = pow(specular, SpecularPower * glosstex[3]);
3884                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3885                         {
3886                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3887                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3888                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3889                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3890                         }
3891                         else
3892                         {
3893                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3894                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3895                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3896                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3897                         }
3898
3899                         buffer_FragColorbgra8[x*4+0] = d[0];
3900                         buffer_FragColorbgra8[x*4+1] = d[1];
3901                         buffer_FragColorbgra8[x*4+2] = d[2];
3902                         buffer_FragColorbgra8[x*4+3] = d[3];
3903                 }
3904         }
3905         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3906         {
3907                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3908                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3909                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3910                 Color_Diffuse[3] = 0.0f;
3911                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3912                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3913                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3914                 LightColor[3] = 0.0f;
3915                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3916
3917                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3918                 {
3919                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3920                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3921                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3922                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3923                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3924                 }
3925                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3926                 {
3927                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3928                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3929                 }
3930                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3931                 {
3932                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3933                 }
3934                 else
3935                 {
3936                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3937                 }
3938
3939                 for (x = startx;x < endx;x++)
3940                 {
3941                         z = buffer_z[x];
3942                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3943                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3944                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3945                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3946                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3947                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3948                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3949                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3950
3951                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3952                         {
3953                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3954                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3955                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3956                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3957
3958                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3959                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3960                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3961                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3962
3963                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3964                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3965                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3966                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3967
3968                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3969                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3970                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3971                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3972
3973                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3974                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3975
3976                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3977                                 {
3978                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3979                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3980                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3981                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3982                                 }
3983                         }
3984                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3985                         {
3986                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3987                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3988                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3989                                 {
3990                                         float f = 1.0f / 256.0f;
3991                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3992                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3993                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3994                                 }
3995                         }
3996                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3997                         {
3998                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3999                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4000                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4001                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4002
4003                                 LightColor[0] = 1.0;
4004                                 LightColor[1] = 1.0;
4005                                 LightColor[2] = 1.0;
4006                         }
4007                         else
4008                         {
4009                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4010                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4011                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4012                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4013                         }
4014
4015                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4016                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4017                         {
4018                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4019                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4020                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4021                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4022                         }
4023                         else
4024                         {
4025                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4026                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4027                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4028                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4029                         }
4030                         buffer_FragColorbgra8[x*4+0] = d[0];
4031                         buffer_FragColorbgra8[x*4+1] = d[1];
4032                         buffer_FragColorbgra8[x*4+2] = d[2];
4033                         buffer_FragColorbgra8[x*4+3] = d[3];
4034                 }
4035         }
4036         else
4037         {
4038                 for (x = startx;x < endx;x++)
4039                 {
4040                         z = buffer_z[x];
4041                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4042                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4043                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4044                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4045
4046                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4047                         {
4048                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4049                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4050                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4051                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4052                         }
4053                         else
4054                         {
4055                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4056                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4057                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4058                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4059                         }
4060                         buffer_FragColorbgra8[x*4+0] = d[0];
4061                         buffer_FragColorbgra8[x*4+1] = d[1];
4062                         buffer_FragColorbgra8[x*4+2] = d[2];
4063                         buffer_FragColorbgra8[x*4+3] = d[3];
4064                 }
4065         }
4066         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4067 }
4068
4069
4070
4071 void DPSOFTRAST_VertexShader_LightSource(void)
4072 {
4073         int i;
4074         int numvertices = dpsoftrast.numvertices;
4075         float LightPosition[4];
4076         float LightVector[4];
4077         float LightVectorModelSpace[4];
4078         float EyePosition[4];
4079         float EyeVectorModelSpace[4];
4080         float EyeVector[4];
4081         float position[4];
4082         float svector[4];
4083         float tvector[4];
4084         float normal[4];
4085         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4086         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4087         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4088         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4089         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4090         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4091         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4092         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4093         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4094         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4095         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4096         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4097         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4098         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4099         for (i = 0;i < numvertices;i++)
4100         {
4101                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4102                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4103                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4104                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4105                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4106                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4107                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4108                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4109                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4110                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4111                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4112                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4113                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4114                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4115                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4116                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4117                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4118                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4119                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4120                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4121                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4122                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4123                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4124                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4125                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4126                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4127                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4128                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4129                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4130                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4131                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4132                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4133         }
4134         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4135         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4136 }
4137
4138 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4139 {
4140 #ifdef SSE_POSSIBLE
4141         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4142         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4143         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4144         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4145         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4146         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4147         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4148         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4149         int x, startx = span->startx, endx = span->endx;
4150         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4151         float CubeVectordata[4];
4152         float CubeVectorslope[4];
4153         float LightVectordata[4];
4154         float LightVectorslope[4];
4155         float EyeVectordata[4];
4156         float EyeVectorslope[4];
4157         float z;
4158         float diffusetex[4];
4159         float glosstex[4];
4160         float surfacenormal[4];
4161         float lightnormal[4];
4162         float eyenormal[4];
4163         float specularnormal[4];
4164         float diffuse;
4165         float specular;
4166         float SpecularPower;
4167         float CubeVector[4];
4168         float attenuation;
4169         int d[4];
4170         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4171         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4172         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4173         Color_Glow[3] = 0.0f;
4174         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4175         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4176         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4177         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4178         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4179         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4180         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4181         Color_Diffuse[3] = 0.0f;
4182         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4183         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4184         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4185         Color_Specular[3] = 0.0f;
4186         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4187         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4188         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4189         Color_Pants[3] = 0.0f;
4190         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4191         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4192         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4193         Color_Shirt[3] = 0.0f;
4194         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4195         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4196         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4197         LightColor[3] = 0.0f;
4198         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4199         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4200         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4201         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4202         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4203         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4204         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4205         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4206         {
4207                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4208                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4209         }
4210         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4211                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4212         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4213         {
4214                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4215                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4216                 for (x = startx;x < endx;x++)
4217                 {
4218                         z = buffer_z[x];
4219                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4220                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4221                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4222                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4223                         if (attenuation < 0.01f)
4224                                 continue;
4225                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4226                         {
4227                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4228                                 if (attenuation < 0.01f)
4229                                         continue;
4230                         }
4231
4232                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4233                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4234                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4235                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4236                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4237                         {
4238                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4239                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4240                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4241                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4242                         }
4243                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4244                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4245                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4246                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4247                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4248                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4249                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4250                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4251
4252                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4253                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4254                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4255                         DPSOFTRAST_Vector3Normalize(lightnormal);
4256
4257                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4258
4259                         if(thread->shader_exactspecularmath)
4260                         {
4261                                 // reflect lightnormal at surfacenormal, take the negative of that
4262                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4263                                 float f;
4264                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4265                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4266                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4267                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4268
4269                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4270                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4271                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4272                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4273                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4274
4275                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4276                         }
4277                         else
4278                         {
4279                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4280                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4281                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4282                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4283
4284                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4285                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4286                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4287                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4288
4289                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4290                         }
4291                         specular = pow(specular, SpecularPower * glosstex[3]);
4292
4293                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4294                         {
4295                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4296                                 attenuation *= (1.0f / 255.0f);
4297                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4298                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4299                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4300                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4301                         }
4302                         else
4303                         {
4304                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4305                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4306                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4307                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4308                         }
4309                         buffer_FragColorbgra8[x*4+0] = d[0];
4310                         buffer_FragColorbgra8[x*4+1] = d[1];
4311                         buffer_FragColorbgra8[x*4+2] = d[2];
4312                         buffer_FragColorbgra8[x*4+3] = d[3];
4313                 }
4314         }
4315         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4316         {
4317                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4318                 for (x = startx;x < endx;x++)
4319                 {
4320                         z = buffer_z[x];
4321                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4322                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4323                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4324                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4325                         if (attenuation < 0.01f)
4326                                 continue;
4327                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4328                         {
4329                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4330                                 if (attenuation < 0.01f)
4331                                         continue;
4332                         }
4333
4334                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4335                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4336                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4337                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4338                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4339                         {
4340                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4341                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4342                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4343                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4344                         }
4345                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4346                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4347                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4348                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4349
4350                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4351                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4352                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4353                         DPSOFTRAST_Vector3Normalize(lightnormal);
4354
4355                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4356                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4357                         {
4358                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4359                                 attenuation *= (1.0f / 255.0f);
4360                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4361                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4362                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4363                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4364                         }
4365                         else
4366                         {
4367                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4368                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4369                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4370                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4371                         }
4372                         buffer_FragColorbgra8[x*4+0] = d[0];
4373                         buffer_FragColorbgra8[x*4+1] = d[1];
4374                         buffer_FragColorbgra8[x*4+2] = d[2];
4375                         buffer_FragColorbgra8[x*4+3] = d[3];
4376                 }
4377         }
4378         else
4379         {
4380                 for (x = startx;x < endx;x++)
4381                 {
4382                         z = buffer_z[x];
4383                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4384                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4385                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4386                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4387                         if (attenuation < 0.01f)
4388                                 continue;
4389                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4390                         {
4391                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4392                                 if (attenuation < 0.01f)
4393                                         continue;
4394                         }
4395
4396                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4397                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4398                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4399                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4400                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4401                         {
4402                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4403                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4404                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4405                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4406                         }
4407                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4408                         {
4409                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4410                                 attenuation *= (1.0f / 255.0f);
4411                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4412                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4413                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4414                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4415                         }
4416                         else
4417                         {
4418                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4419                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4420                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4421                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4422                         }
4423                         buffer_FragColorbgra8[x*4+0] = d[0];
4424                         buffer_FragColorbgra8[x*4+1] = d[1];
4425                         buffer_FragColorbgra8[x*4+2] = d[2];
4426                         buffer_FragColorbgra8[x*4+3] = d[3];
4427                 }
4428         }
4429         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4430 #endif
4431 }
4432
4433
4434
4435 void DPSOFTRAST_VertexShader_Refraction(void)
4436 {
4437         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4438         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4439         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4440 }
4441
4442 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4443 {
4444         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4445         float z;
4446         int x, startx = span->startx, endx = span->endx;
4447
4448         // texture reads
4449         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4450         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4451
4452         // varyings
4453         float ModelViewProjectionPositiondata[4];
4454         float ModelViewProjectionPositionslope[4];
4455
4456         // uniforms
4457         float ScreenScaleRefractReflect[2];
4458         float ScreenCenterRefractReflect[2];
4459         float DistortScaleRefractReflect[2];
4460         float RefractColor[4];
4461
4462         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4463         if(!texture) return;
4464
4465         // read textures
4466         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4467         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4468
4469         // read varyings
4470         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4471
4472         // read uniforms
4473         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4474         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4475         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4476         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4477         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4478         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4479         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4480         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4481         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4482         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4483
4484         // do stuff
4485         for (x = startx;x < endx;x++)
4486         {
4487                 float SafeScreenTexCoord[2];
4488                 float ScreenTexCoord[2];
4489                 float v[3];
4490                 float iw;
4491                 unsigned char c[4];
4492
4493                 z = buffer_z[x];
4494
4495                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4496                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4497
4498                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4499                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4500                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4501
4502                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4503                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4504                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4505                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4506                 DPSOFTRAST_Vector3Normalize(v);
4507                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4508                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4509
4510                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4511                 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4512
4513                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4514                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4515                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4516                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4517         }
4518
4519         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4520 }
4521
4522
4523
4524 void DPSOFTRAST_VertexShader_Water(void)
4525 {
4526         int i;
4527         int numvertices = dpsoftrast.numvertices;
4528         float EyePosition[4];
4529         float EyeVectorModelSpace[4];
4530         float EyeVector[4];
4531         float position[4];
4532         float svector[4];
4533         float tvector[4];
4534         float normal[4];
4535         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4536         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4537         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4538         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4539         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4540         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4541         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4542         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4543         for (i = 0;i < numvertices;i++)
4544         {
4545                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4546                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4547                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4548                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4549                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4550                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4551                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4552                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4553                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4554                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4555                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4556                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4557                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4558                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4559                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4560                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4561                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4562                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4563                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4564                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4565                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4566                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4567         }
4568         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4569         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4570         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4571 }
4572
4573
4574 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4575 {
4576         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4577         float z;
4578         int x, startx = span->startx, endx = span->endx;
4579
4580         // texture reads
4581         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4582         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4583
4584         // varyings
4585         float ModelViewProjectionPositiondata[4];
4586         float ModelViewProjectionPositionslope[4];
4587         float EyeVectordata[4];
4588         float EyeVectorslope[4];
4589
4590         // uniforms
4591         float ScreenScaleRefractReflect[2];
4592         float ScreenCenterRefractReflect[2];
4593         float DistortScaleRefractReflect[2];
4594         float RefractColor[4];
4595         float ReflectColor[4];
4596         float ReflectFactor;
4597         float ReflectOffset;
4598
4599         DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4600         DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4601         if(!texture_refraction || !texture_reflection) return;
4602
4603         // read textures
4604         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4605         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4606
4607         // read varyings
4608         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4609         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4610
4611         // read uniforms
4612         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4613         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4614         ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4615         ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4616         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4617         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4618         ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4619         ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4620         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4621         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4622         DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4623         DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4624         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4625         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4626         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4627         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4628         ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4629         ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4630         ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4631         ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4632         ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4633         ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4634
4635         // do stuff
4636         for (x = startx;x < endx;x++)
4637         {
4638                 float SafeScreenTexCoord[4];
4639                 float ScreenTexCoord[4];
4640                 float v[3];
4641                 float iw;
4642                 unsigned char c1[4];
4643                 unsigned char c2[4];
4644                 float Fresnel;
4645
4646                 z = buffer_z[x];
4647
4648                 // "    vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4649                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4650
4651                 // "    vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4652                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4653                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4654                 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4655                 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4656
4657                 // "    vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4658                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4659                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4660                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4661                 DPSOFTRAST_Vector3Normalize(v);
4662                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4663                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4664                 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4665                 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4666
4667                 // "    float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4668                 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4669                 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4670                 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4671                 DPSOFTRAST_Vector3Normalize(v);
4672                 Fresnel = 1.0f - v[2];
4673                 Fresnel = min(1.0f, Fresnel);
4674                 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4675
4676                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4677                 // "    dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4678                 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4679                 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4680
4681                 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4682                 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4683                 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4684                 buffer_FragColorbgra8[x*4+3] = min((    RefractColor[3] *  (1.0f - Fresnel) +          ReflectColor[3]  * Fresnel) * 256, 255);
4685         }
4686
4687         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4688 }
4689
4690
4691
4692 void DPSOFTRAST_VertexShader_ShowDepth(void)
4693 {
4694         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4695 }
4696
4697 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4698 {
4699         // TODO: IMPLEMENT
4700         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4701         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4702         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4703         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4704         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4705 }
4706
4707
4708
4709 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4710 {
4711         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4712 }
4713
4714 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4715 {
4716         // TODO: IMPLEMENT
4717         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4718         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4719         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4720         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4721         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4722 }
4723
4724
4725
4726 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4727 {
4728         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4729 }
4730
4731 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4732 {
4733         // TODO: IMPLEMENT
4734         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4735         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4736         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4737         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4738         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4739 }
4740
4741
4742
4743 typedef struct DPSOFTRAST_ShaderModeInfo_s
4744 {
4745         int lodarrayindex;
4746         void (*Vertex)(void);
4747         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4748         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4749         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4750 }
4751 DPSOFTRAST_ShaderModeInfo;
4752
4753 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4754 {
4755         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4756         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4757         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4758         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4759         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4760         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4761         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4762         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4763         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4764         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4765         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4766         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4767         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4768         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4769         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4770         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4771 };
4772
4773 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4774 {
4775         int x;
4776         int startx;
4777         int endx;
4778         unsigned int *depthpixel;
4779         int depth;
4780         int depthslope;
4781         unsigned int d;
4782         unsigned char *pixelmask;
4783         DPSOFTRAST_State_Triangle *triangle;
4784         triangle = &thread->triangles[span->triangle];
4785         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4786         startx = span->startx;
4787         endx = span->endx;
4788         depth = span->depthbase;
4789         depthslope = span->depthslope;
4790         pixelmask = thread->pixelmaskarray;
4791         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4792         {
4793                 switch(thread->fb_depthfunc)
4794                 {
4795                 default:
4796                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4797                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4798                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4799                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4800                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4801                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4802                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4803                 }
4804                 while (startx < endx && !pixelmask[startx])
4805                         startx++;
4806                 while (endx > startx && !pixelmask[endx-1])
4807                         endx--;
4808         }
4809         else
4810         {
4811                 // no depth testing means we're just dealing with color...
4812                 memset(pixelmask + startx, 1, endx - startx);
4813         }
4814         span->pixelmask = pixelmask;
4815         span->startx = startx;
4816         span->endx = endx;
4817 }
4818
4819 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4820 {
4821         int x, d, depth, depthslope, startx, endx;
4822         const unsigned char *pixelmask;
4823         unsigned int *depthpixel;
4824         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4825         {
4826                 depth = span->depthbase;
4827                 depthslope = span->depthslope;
4828                 pixelmask = span->pixelmask;
4829                 startx = span->startx;
4830                 endx = span->endx;
4831                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4832                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4833                         if (pixelmask[x])
4834                                 depthpixel[x] = d;
4835         }
4836 }
4837
4838 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4839 {
4840         int i;
4841         DPSOFTRAST_State_Triangle *triangle;
4842         DPSOFTRAST_State_Span *span;
4843         for (i = 0; i < thread->numspans; i++)
4844         {
4845                 span = &thread->spans[i];
4846                 triangle = &thread->triangles[span->triangle];
4847                 DPSOFTRAST_Draw_DepthTest(thread, span);
4848                 if (span->startx >= span->endx)
4849                         continue;
4850                 // run pixel shader if appropriate
4851                 // do this before running depthmask code, to allow the pixelshader
4852                 // to clear pixelmask values for alpha testing
4853                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4854                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4855                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4856         }
4857         thread->numspans = 0;
4858 }
4859
4860 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4861
4862 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4863 {
4864 #ifdef SSE_POSSIBLE
4865         int cullface = thread->cullface;
4866         int minx, maxx, miny, maxy;
4867         int miny1, maxy1, miny2, maxy2;
4868         __m128i fbmin, fbmax;
4869         __m128 viewportcenter, viewportscale;
4870         int firstvertex = command->firstvertex;
4871         int numvertices = command->numvertices;
4872         int numtriangles = command->numtriangles;
4873         const int *element3i = command->element3i;
4874         const unsigned short *element3s = command->element3s;
4875         int clipped = command->clipped;
4876         int i;
4877         int j;
4878         int k;
4879         int y;
4880         int e[3];
4881         __m128i screeny;
4882         int starty, endy, bandy;
4883         int numpoints;
4884         int clipcase;
4885         float clipdist[4];
4886         float clip0origin, clip0slope;
4887         int clip0dir;
4888         __m128 triangleedge1, triangleedge2, trianglenormal;
4889         __m128 clipfrac[3];
4890         __m128 screen[4];
4891         DPSOFTRAST_State_Triangle *triangle;
4892         DPSOFTRAST_Texture *texture;
4893         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4894         miny = thread->fb_scissor[1];
4895         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4896         miny1 = bound(miny, thread->miny1, maxy);
4897         maxy1 = bound(miny, thread->maxy1, maxy);
4898         miny2 = bound(miny, thread->miny2, maxy);
4899         maxy2 = bound(miny, thread->maxy2, maxy);
4900         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4901         {
4902                 if (!ATOMIC_DECREMENT(command->refcount))
4903                 {
4904                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4905                                 MM_FREE(command->arrays);
4906                 }
4907                 return;
4908         }
4909         minx = thread->fb_scissor[0];
4910         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4911         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4912         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4913         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4914         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4915         screen[3] = _mm_setzero_ps();
4916         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4917         for (i = 0;i < numtriangles;i++)
4918         {
4919                 const float *screencoord4f = command->arrays;
4920                 const float *arrays = screencoord4f + numvertices*4;
4921
4922                 // generate the 3 edges of this triangle
4923                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4924                 if (element3s)
4925                 {
4926                         e[0] = element3s[i*3+0] - firstvertex;
4927                         e[1] = element3s[i*3+1] - firstvertex;
4928                         e[2] = element3s[i*3+2] - firstvertex;
4929                 }
4930                 else if (element3i)
4931                 {
4932                         e[0] = element3i[i*3+0] - firstvertex;
4933                         e[1] = element3i[i*3+1] - firstvertex;
4934                         e[2] = element3i[i*3+2] - firstvertex;
4935                 }
4936                 else
4937                 {
4938                         e[0] = i*3+0;
4939                         e[1] = i*3+1;
4940                         e[2] = i*3+2;
4941                 }
4942
4943 #define SKIPBACKFACE \
4944                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4945                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4946                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4947                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4948                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4949                 switch(cullface) \
4950                 { \
4951                 case GL_BACK: \
4952                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4953                                 continue; \
4954                         break; \
4955                 case GL_FRONT: \
4956                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4957                                 continue; \
4958                         break; \
4959                 }
4960
4961 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4962                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4963                         { \
4964                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4965                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4966                         }
4967 #define CLIPPEDVERTEXCOPY(k,p1) \
4968                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4969
4970 #define GENATTRIBCOPY(attrib, p1) \
4971                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4972 #define GENATTRIBLERP(attrib, p1, p2) \
4973                 { \
4974                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4975                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4976                 }
4977 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4978                 switch(clipcase) \
4979                 { \
4980                 default: \
4981                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4982                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4983                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4984                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4985                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4986                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4987                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4988                 }
4989
4990                 if (! clipped)
4991                         goto notclipped;
4992
4993                 // calculate distance from nearplane
4994                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4995                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4996                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4997                 if (clipdist[0] >= 0.0f)
4998                 {
4999                         if (clipdist[1] >= 0.0f)
5000                         {
5001                                 if (clipdist[2] >= 0.0f)
5002                                 {
5003                                 notclipped:
5004                                         // triangle is entirely in front of nearplane
5005                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5006                                         SKIPBACKFACE;
5007                                         numpoints = 3;
5008                                         clipcase = 0;
5009                                 }
5010                                 else
5011                                 {
5012                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5013                                         SKIPBACKFACE;
5014                                         numpoints = 4;
5015                                         clipcase = 1;
5016                                 }
5017                         }
5018                         else
5019                         {
5020                                 if (clipdist[2] >= 0.0f)
5021                                 {
5022                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5023                                         SKIPBACKFACE;
5024                                         numpoints = 4;
5025                                         clipcase = 2;
5026                                 }
5027                                 else
5028                                 {
5029                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5030                                         SKIPBACKFACE;
5031                                         numpoints = 3;
5032                                         clipcase = 3;
5033                                 }
5034                         }
5035                 }
5036                 else if (clipdist[1] >= 0.0f)
5037                 {
5038                         if (clipdist[2] >= 0.0f)
5039                         {
5040                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5041                                 SKIPBACKFACE;
5042                                 numpoints = 4;
5043                                 clipcase = 4;
5044                         }
5045                         else
5046                         {
5047                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5048                                 SKIPBACKFACE;
5049                                 numpoints = 3;
5050                                 clipcase = 5;
5051                         }
5052                 }
5053                 else if (clipdist[2] >= 0.0f)
5054                 {
5055                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5056                         SKIPBACKFACE;
5057                         numpoints = 3;
5058                         clipcase = 6;
5059                 }
5060                 else continue; // triangle is entirely behind nearplane
5061
5062                 {
5063                         // calculate integer y coords for triangle points
5064                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5065                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5066                                         screenmin = _mm_min_epi16(screeni, screenir),
5067                                         screenmax = _mm_max_epi16(screeni, screenir);
5068                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5069                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5070                         screenmin = _mm_max_epi16(screenmin, fbmin);
5071                         screenmax = _mm_min_epi16(screenmax, fbmax);
5072                         // skip offscreen triangles
5073                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5074                                 continue;
5075                         starty = _mm_extract_epi16(screenmin, 1);
5076                         endy = _mm_extract_epi16(screenmax, 1)+1;
5077                         if (starty >= maxy1 && endy <= miny2)
5078                                 continue;
5079                         screeny = _mm_srai_epi32(screeni, 16);
5080                 }
5081
5082                 triangle = &thread->triangles[thread->numtriangles];
5083
5084                 // calculate attribute plans for triangle data...
5085                 // okay, this triangle is going to produce spans, we'd better project
5086                 // the interpolants now (this is what gives perspective texturing),
5087                 // this consists of simply multiplying all arrays by the W coord
5088                 // (which is basically 1/Z), which will be undone per-pixel
5089                 // (multiplying by Z again) to get the perspective-correct array
5090                 // values
5091                 {
5092                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5093                         __m128 mipedgescale, mipdensity;
5094                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5095                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5096                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5097                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5098                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5099                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5100                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5101                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5102                         attribedge1 = _mm_sub_ss(w0, w1);
5103                         attribedge2 = _mm_sub_ss(w2, w1);
5104                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5105                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5106                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5107                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5108                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5109                         _mm_store_ss(&triangle->w[0], attribxslope);
5110                         _mm_store_ss(&triangle->w[1], attribyslope);
5111                         _mm_store_ss(&triangle->w[2], attriborigin);
5112                         
5113                         clip0origin = 0;
5114                         clip0slope = 0;
5115                         clip0dir = 0;
5116                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5117                         {
5118                                 float cliporigin, clipxslope, clipyslope;
5119                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5120                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5121                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5122                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5123                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5124                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5125                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5126                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5127                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5128                                 if(clipxslope != 0)
5129                                 {
5130                                         clip0origin = -cliporigin/clipxslope;
5131                                         clip0slope = -clipyslope/clipxslope;
5132                                         clip0dir = clipxslope > 0 ? 1 : -1;
5133                                 }
5134                                 else if(clipyslope > 0)
5135                                 {
5136                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5137                                         clip0slope = dpsoftrast.fb_width;
5138                                         clip0dir = -1;
5139                                 }
5140                                 else if(clipyslope < 0)
5141                                 {
5142                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5143                                         clip0slope = -dpsoftrast.fb_width;
5144                                         clip0dir = -1;
5145                                 }
5146                                 else if(clip0origin < 0) continue;
5147                         }
5148
5149                         mipedgescale = _mm_setzero_ps();
5150                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5151                         {
5152                                 __m128 attrib0, attrib1, attrib2;
5153                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5154                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5155                                         break;
5156                                 arrays += numvertices*4;
5157                                 GENATTRIBS(attrib0, attrib1, attrib2);
5158                                 attriborigin = _mm_mul_ps(attrib1, w1);
5159                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5160                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5161                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5162                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5163                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5164                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5165                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5166                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5167                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5168                                 {
5169                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5170                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5171                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5172                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5173                                 }
5174                         }
5175
5176                         memset(triangle->mip, 0, sizeof(triangle->mip));
5177                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5178                         {
5179                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5180                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5181                                         break;
5182                                 texture = thread->texbound[texunit];
5183                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5184                                 {
5185                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5186                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5187                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5188                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5189                                         // this will be multiplied in the texturing routine by the texture resolution
5190                                         y = _mm_cvtss_si32(mipdensity);
5191                                         if (y > 0)
5192                                         {
5193                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5194                                                 if (y > texture->mipmaps - 1)
5195                                                         y = texture->mipmaps - 1;
5196                                                 triangle->mip[texunit] = y;
5197                                         }
5198                                 }
5199                         }
5200                 }
5201         
5202                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5203                 for (; y < bandy;)
5204                 {
5205                         __m128 xcoords, xslope;
5206                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5207                         int yccmask = _mm_movemask_epi8(ycc);
5208                         int edge0p, edge0n, edge1p, edge1n;
5209                         int nexty;
5210                         float w, wslope;
5211                         float clip0;
5212                         if (numpoints == 4)
5213                         {
5214                                 switch(yccmask)
5215                                 {
5216                                 default:
5217                                 case 0xFFFF: /*0000*/ y = endy; continue;
5218                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5219                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5220                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5221                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5222                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5223                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5224                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5225                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5226                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5227                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5228                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5229                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5230                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5231                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5232                                 case 0x0000: /*1111*/ y++; continue;
5233                                 }
5234                         }
5235                         else
5236                         {
5237                                 switch(yccmask)
5238                                 {
5239                                 default:
5240                                 case 0xFFFF: /*000*/ y = endy; continue;
5241                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5242                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5243                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5244                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5245                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5246                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5247                                 case 0x0000: /*111*/ y++; continue;
5248                                 }
5249                         }
5250                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5251                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5252                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5253                         nexty = _mm_extract_epi16(ycc, 0);
5254                         if (nexty >= bandy) nexty = bandy-1;
5255                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5256                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5257                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5258                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5259                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5260                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5261                         {
5262                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5263                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5264                         }
5265                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5266                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5267                         {
5268                                 int startx, endx, offset;
5269                                 startx = _mm_cvtss_si32(xcoords);
5270                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5271                                 if (startx < minx) startx = minx;
5272                                 if (endx > maxx) endx = maxx;
5273                                 if (startx >= endx) continue;
5274
5275                                 if (clip0dir)
5276                                 {
5277                                         if (clip0dir > 0)
5278                                         {
5279                                                 if (startx < clip0) 
5280                                                 {
5281                                                         if(endx <= clip0) continue;
5282                                                         startx = (int)clip0;
5283                                                 }
5284                                         }
5285                                         else if (endx > clip0) 
5286                                         {
5287                                                 if(startx >= clip0) continue;
5288                                                 endx = (int)clip0;
5289                                         }
5290                                 }
5291                                                 
5292                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5293                                 {
5294                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5295                                         span->triangle = thread->numtriangles;
5296                                         span->x = offset;
5297                                         span->y = y;
5298                                         span->startx = 0;
5299                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5300                                         if (span->startx >= span->endx)
5301                                                 continue;
5302                                         wslope = triangle->w[0];
5303                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5304                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5305                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5306                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5307                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5308                                 }
5309                         }
5310                 }
5311
5312                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5313                 {
5314                         DPSOFTRAST_Draw_ProcessSpans(thread);
5315                         thread->numtriangles = 0;
5316                 }
5317         }
5318
5319         if (!ATOMIC_DECREMENT(command->refcount))
5320         {
5321                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5322                         MM_FREE(command->arrays);
5323         }
5324
5325         if (thread->numspans > 0 || thread->numtriangles > 0)
5326         {
5327                 DPSOFTRAST_Draw_ProcessSpans(thread);
5328                 thread->numtriangles = 0;
5329         }
5330 #endif
5331 }
5332
5333 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5334 {
5335         int i;
5336         int j;
5337         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5338         int datasize = 2*numvertices*sizeof(float[4]);
5339         DPSOFTRAST_Command_Draw *command;
5340         unsigned char *data;
5341         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5342         {
5343                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5344                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5345                         break;
5346                 datasize += numvertices*sizeof(float[4]);
5347         }
5348         if (element3s)
5349                 datasize += numtriangles*sizeof(unsigned short[3]);
5350         else if (element3i)
5351                 datasize += numtriangles*sizeof(int[3]);
5352         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5353         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5354         {
5355                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5356                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5357         }
5358         else
5359         {
5360                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5361                 data = (unsigned char *)command + commandsize;
5362         }
5363         command->firstvertex = firstvertex;
5364         command->numvertices = numvertices;
5365         command->numtriangles = numtriangles;
5366         command->arrays = (float *)data;
5367         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5368         dpsoftrast.firstvertex = firstvertex;
5369         dpsoftrast.numvertices = numvertices;
5370         dpsoftrast.screencoord4f = (float *)data;
5371         data += numvertices*sizeof(float[4]);
5372         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5373         data += numvertices*sizeof(float[4]);
5374         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5375         {
5376                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5377                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5378                         break;
5379                 dpsoftrast.post_array4f[j] = (float *)data;
5380                 data += numvertices*sizeof(float[4]);
5381         }
5382         command->element3i = NULL;
5383         command->element3s = NULL;
5384         if (element3s)
5385         {
5386                 command->element3s = (unsigned short *)data;
5387                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5388         }
5389         else if (element3i)
5390         {
5391                 command->element3i = (int *)data;
5392                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5393         }
5394         return command;
5395 }
5396
5397 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5398 {
5399         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5400         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5401         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5402         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5403         if (command->starty >= command->endy)
5404         {
5405                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5406                         MM_FREE(command->arrays);
5407                 DPSOFTRAST_UndoCommand(command->commandsize);
5408                 return;
5409         }
5410         command->clipped = dpsoftrast.drawclipped;
5411         command->refcount = dpsoftrast.numthreads;
5412
5413         if (dpsoftrast.usethreads)
5414         {
5415                 int i;
5416                 DPSOFTRAST_Draw_SyncCommands();
5417                 for (i = 0; i < dpsoftrast.numthreads; i++)
5418                 {
5419                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5420                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5421                                 Thread_CondSignal(thread->drawcond);
5422                 }
5423         }
5424         else
5425         {
5426                 DPSOFTRAST_Draw_FlushThreads();
5427         }
5428 }
5429
5430 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5431 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5432 {
5433         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5434 }
5435 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5436 {
5437         DPSOFTRAST_Command_SetRenderTargets *command;
5438         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5439                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5440                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5441                 DPSOFTRAST_Flush();
5442         dpsoftrast.fb_width = width;
5443         dpsoftrast.fb_height = height;
5444         dpsoftrast.fb_depthpixels = depthpixels;
5445         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5446         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5447         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5448         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5449         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5450         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5451         command->width = width;
5452         command->height = height;
5453 }
5454  
5455 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5456 {
5457         int commandoffset = thread->commandoffset;
5458         while (commandoffset != endoffset)
5459         {
5460                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5461                 switch (command->opcode)
5462                 {
5463 #define INTERPCOMMAND(name) \
5464                 case DPSOFTRAST_OPCODE_##name : \
5465                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5466                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5467                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5468                                 commandoffset = 0; \
5469                         break;
5470                 INTERPCOMMAND(Viewport)
5471                 INTERPCOMMAND(ClearColor)
5472                 INTERPCOMMAND(ClearDepth)
5473                 INTERPCOMMAND(ColorMask)
5474                 INTERPCOMMAND(DepthTest)
5475                 INTERPCOMMAND(ScissorTest)
5476                 INTERPCOMMAND(Scissor)
5477                 INTERPCOMMAND(BlendFunc)
5478                 INTERPCOMMAND(BlendSubtract)
5479                 INTERPCOMMAND(DepthMask)
5480                 INTERPCOMMAND(DepthFunc)
5481                 INTERPCOMMAND(DepthRange)
5482                 INTERPCOMMAND(PolygonOffset)
5483                 INTERPCOMMAND(CullFace)
5484                 INTERPCOMMAND(AlphaTest)
5485                 INTERPCOMMAND(AlphaFunc)
5486                 INTERPCOMMAND(SetTexture)
5487                 INTERPCOMMAND(SetShader)
5488                 INTERPCOMMAND(Uniform4f)
5489                 INTERPCOMMAND(UniformMatrix4f)
5490                 INTERPCOMMAND(Uniform1i)
5491                 INTERPCOMMAND(SetRenderTargets)
5492                 INTERPCOMMAND(ClipPlane)
5493
5494                 case DPSOFTRAST_OPCODE_Draw:
5495                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5496                         commandoffset += command->commandsize;
5497                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5498                                 commandoffset = 0;
5499                         thread->commandoffset = commandoffset;
5500                         break;
5501
5502                 case DPSOFTRAST_OPCODE_Reset:
5503                         commandoffset = 0;
5504                         break;
5505                 }
5506         }
5507         thread->commandoffset = commandoffset;
5508 }
5509
5510 static int DPSOFTRAST_Draw_Thread(void *data)
5511 {
5512         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5513         while(thread->index >= 0)
5514         {
5515                 if (thread->commandoffset != dpsoftrast.drawcommand)
5516                 {
5517                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5518                 }
5519                 else 
5520                 {
5521                         Thread_LockMutex(thread->drawmutex);
5522                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5523                         {
5524                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5525                                 thread->starving = true;
5526                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5527                                 thread->starving = false;
5528                         }
5529                         Thread_UnlockMutex(thread->drawmutex);
5530                 }
5531         }   
5532         return 0;
5533 }
5534
5535 static void DPSOFTRAST_Draw_FlushThreads(void)
5536 {
5537         DPSOFTRAST_State_Thread *thread;
5538         int i;
5539         DPSOFTRAST_Draw_SyncCommands();
5540         if (dpsoftrast.usethreads) 
5541         {
5542                 for (i = 0; i < dpsoftrast.numthreads; i++)
5543                 {
5544                         thread = &dpsoftrast.threads[i];
5545                         if (thread->commandoffset != dpsoftrast.drawcommand)
5546                         {
5547                                 Thread_LockMutex(thread->drawmutex);
5548                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5549                                         Thread_CondSignal(thread->drawcond);
5550                                 Thread_UnlockMutex(thread->drawmutex);
5551                         }
5552                 }
5553                 for (i = 0; i < dpsoftrast.numthreads; i++)
5554                 {
5555                         thread = &dpsoftrast.threads[i];
5556                         if (thread->commandoffset != dpsoftrast.drawcommand)
5557                         {
5558                                 Thread_LockMutex(thread->drawmutex);
5559                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5560                                 {
5561                                         thread->waiting = true;
5562                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5563                                         thread->waiting = false;
5564                                 }
5565                                 Thread_UnlockMutex(thread->drawmutex);
5566                         }
5567                 }
5568         }
5569         else
5570         {
5571                 for (i = 0; i < dpsoftrast.numthreads; i++)
5572                 {
5573                         thread = &dpsoftrast.threads[i];
5574                         if (thread->commandoffset != dpsoftrast.drawcommand)
5575                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5576                 }
5577         }
5578         dpsoftrast.commandpool.usedcommands = 0;
5579 }
5580
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads(), which
// processes all queued commands before returning.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5585
// Public finish entry point; currently does exactly what DPSOFTRAST_Flush()
// does.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5590
5591 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5592 {
5593         int i;
5594         union
5595         {
5596                 int i;
5597                 unsigned char b[4];
5598         }
5599         u;
5600         u.i = 1;
5601         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5602         dpsoftrast.bigendian = u.b[3];
5603         dpsoftrast.fb_width = width;
5604         dpsoftrast.fb_height = height;
5605         dpsoftrast.fb_depthpixels = depthpixels;
5606         dpsoftrast.fb_colorpixels[0] = colorpixels;
5607         dpsoftrast.fb_colorpixels[1] = NULL;
5608         dpsoftrast.fb_colorpixels[1] = NULL;
5609         dpsoftrast.fb_colorpixels[1] = NULL;
5610         dpsoftrast.viewport[0] = 0;
5611         dpsoftrast.viewport[1] = 0;
5612         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5613         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5614         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5615         dpsoftrast.texture_firstfree = 1;
5616         dpsoftrast.texture_end = 1;
5617         dpsoftrast.texture_max = 0;
5618         dpsoftrast.color[0] = 1;
5619         dpsoftrast.color[1] = 1;
5620         dpsoftrast.color[2] = 1;
5621         dpsoftrast.color[3] = 1;
5622         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5623         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5624         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5625         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5626         for (i = 0; i < dpsoftrast.numthreads; i++)
5627         {
5628                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5629                 thread->index = i;
5630                 thread->cullface = GL_BACK;
5631         thread->colormask[0] = 1; 
5632                 thread->colormask[1] = 1;
5633                 thread->colormask[2] = 1;
5634                 thread->colormask[3] = 1;
5635                 thread->blendfunc[0] = GL_ONE;
5636                 thread->blendfunc[1] = GL_ZERO;
5637                 thread->depthmask = true;
5638                 thread->depthtest = true;
5639                 thread->depthfunc = GL_LEQUAL;
5640                 thread->scissortest = false;
5641                 thread->alphatest = false;
5642                 thread->alphafunc = GL_GREATER;
5643                 thread->alphavalue = 0.5f;
5644                 thread->viewport[0] = 0;
5645                 thread->viewport[1] = 0;
5646                 thread->viewport[2] = dpsoftrast.fb_width;
5647                 thread->viewport[3] = dpsoftrast.fb_height;
5648                 thread->scissor[0] = 0;
5649                 thread->scissor[1] = 0;
5650                 thread->scissor[2] = dpsoftrast.fb_width;
5651                 thread->scissor[3] = dpsoftrast.fb_height;
5652                 thread->depthrange[0] = 0;
5653                 thread->depthrange[1] = 1;
5654                 thread->polygonoffset[0] = 0;
5655                 thread->polygonoffset[1] = 0;
5656                 thread->clipplane[0] = 0;
5657                 thread->clipplane[1] = 0;
5658                 thread->clipplane[2] = 0;
5659                 thread->clipplane[3] = 1;
5660         
5661                 thread->numspans = 0;
5662                 thread->numtriangles = 0;
5663                 thread->commandoffset = 0;
5664                 thread->waiting = false;
5665                 thread->starving = false;
5666            
5667                 thread->validate = -1;
5668                 DPSOFTRAST_Validate(thread, -1);
5669  
5670                 if (dpsoftrast.usethreads)
5671                 {
5672                         thread->waitcond = Thread_CreateCond();
5673                         thread->drawcond = Thread_CreateCond();
5674                         thread->drawmutex = Thread_CreateMutex();
5675                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5676                 }
5677         }
5678         return 0;
5679 }
5680
5681 void DPSOFTRAST_Shutdown(void)
5682 {
5683         int i;
5684         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5685         {
5686                 DPSOFTRAST_State_Thread *thread;
5687                 for (i = 0; i < dpsoftrast.numthreads; i++)
5688                 {
5689                         thread = &dpsoftrast.threads[i];
5690                         Thread_LockMutex(thread->drawmutex);
5691                         thread->index = -1;
5692                         Thread_CondSignal(thread->drawcond);
5693                         Thread_UnlockMutex(thread->drawmutex);
5694                         Thread_WaitThread(thread->thread, 0);
5695                         Thread_DestroyCond(thread->waitcond);
5696                         Thread_DestroyCond(thread->drawcond);
5697                         Thread_DestroyMutex(thread->drawmutex);
5698                 }
5699         }
5700         for (i = 0;i < dpsoftrast.texture_end;i++)
5701                 if (dpsoftrast.texture[i].bytes)
5702                         MM_FREE(dpsoftrast.texture[i].bytes);
5703         if (dpsoftrast.texture)
5704                 free(dpsoftrast.texture);
5705         if (dpsoftrast.threads)
5706                 MM_FREE(dpsoftrast.threads);
5707         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5708 }
5709