]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
a646db35bd13d32da8f143f7d5b75f028e8ca292
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
// alignment (in bytes) used for SSE-friendly data and for atomically accessed variables
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 4

// Per-toolchain definitions of the alignment / atomic helpers.
// These are only selected when SSE_POSSIBLE is defined; any macro left
// undefined afterwards receives a plain, non-atomic fallback further down.
#if defined(SSE_POSSIBLE)
	#if defined(__APPLE__)
		#include <libkern/OSAtomic.h>
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(4)))
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile int32_t 
		#define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
		#define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
	#elif defined(__GNUC__) && defined(WIN32)
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(4)))
		// alternative barrier: (__sync_synchronize())
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile LONG
		// The LONG * casts below work around broken mingw packages on
		// Ubuntu, which declare these functions as taking a plain LONG *
		// and would otherwise fail to compile here. The cast appears to be
		// error- and warning-free on platforms that DO declare
		// InterlockedIncrement correctly, like mingw on Windows.
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
	#elif defined(__GNUC__)
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(4)))
		// alternative barrier: (__sync_synchronize())
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile int
		#define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
		#define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
		#define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
	#elif defined(_MSC_VER)
		#define ALIGN(var) __declspec(align(16)) var
		#define ATOMIC(var) __declspec(align(4)) var
		// alternative barrier: (MemoryBarrier())
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile LONG
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
	#endif
#endif

// Fallbacks: no alignment attribute, no barrier, ordinary integer arithmetic.
#if !defined(ALIGN)
#define ALIGN(var) var
#endif
#if !defined(ATOMIC)
#define ATOMIC(var) var
#endif
#if !defined(MEMORY_BARRIER)
#define MEMORY_BARRIER ((void)0)
#endif
#if !defined(ATOMIC_COUNTER)
#define ATOMIC_COUNTER int
#endif
#if !defined(ATOMIC_INCREMENT)
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#if !defined(ATOMIC_DECREMENT)
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#if !defined(ATOMIC_ADD)
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
86
// Aligned allocation helpers.
// With SSE_POSSIBLE the allocations go through _mm_malloc/_mm_free so that
// buffers are ALIGN_SIZE aligned; otherwise the C library allocator is used.
// Memory obtained from MM_MALLOC/MM_CALLOC must be released with MM_FREE.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// Provide a builtin-based _mm_cvtss_f32 only for GCC releases older than 4.6.
// (The version test must compare the major AND minor number; testing the
// minor number alone would also match e.g. GCC 5.1.)
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
	#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif

#define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)

// calloc() equivalent on top of _mm_malloc: returns nmemb*size zeroed bytes
// aligned to ALIGN_SIZE, or NULL if the allocation fails or the byte count
// would overflow size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// reject requests whose byte count does not fit in size_t
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ALIGN_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
109
// Indices of the per-vertex attribute arrays (position, color and eight
// texcoord sets); DPSOFTRAST_ARRAY_TOTAL is the number of arrays.
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION  = 0,
	DPSOFTRAST_ARRAY_COLOR     = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL     = 10
}
DPSOFTRAST_ARRAY;
125
126 typedef struct DPSOFTRAST_Texture_s
127 {
128         int flags;
129         int width;
130         int height;
131         int depth;
132         int sides;
133         DPSOFTRAST_TEXTURE_FILTER filter;
134         int mipmaps;
135         int size;
136         ATOMIC_COUNTER binds;
137         unsigned char *bytes;
138         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
139 }
140 DPSOFTRAST_Texture;
141
// commands use the same alignment as other SSE-friendly data
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Common header shared by every command struct: each DEFCOMMAND struct
// starts with these same two members.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode;
	unsigned short commandsize;
}
DPSOFTRAST_Command);

// opcode 0 has no DEFCOMMAND struct of its own
enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares
//   - the constant DPSOFTRAST_OPCODE_<name> = opcodeval
//   - the aligned struct type DPSOFTRAST_Command_<name>, made of the common
//     header followed by the caller supplied fields
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
162
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
165
166 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
167 {
168         int freecommand;
169         int usedcommands;
170         ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
171 }
172 DPSOFTRAST_State_Command_Pool);
173
174 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
175 {
176         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
177         float w[3];
178         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
179 }
180 DPSOFTRAST_State_Triangle);
181
// Attribute setup for a span.
//   slope = triangle->attribs[arrayindex][0]            (change per pixel in x)
//   data  = triangle->attribs[arrayindex][2]
//           + span->x * triangle->attribs[arrayindex][0]
//           + span->y * triangle->attribs[arrayindex][1]
// DPSOFTRAST_CALCATTRIB is the SSE form (data and slope are __m128 lvalues),
// DPSOFTRAST_CALCATTRIB4F the scalar form (data and slope are float[4]).
// triangle and span may be arbitrary pointer expressions; note that they are
// evaluated several times, so they must be free of side effects.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
198                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels produced while rasterizing a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	int triangle; // triangle this span was generated by
	int x, y; // framebuffer coordinates
	int startx, endx; // usable range (according to pixelmask)
	unsigned char *pixelmask; // true for pixels that passed depth test, false for others
	int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
	int depthslope; // depthbuffer value pixel delta
}
DPSOFTRAST_State_Span);
213
// capacities of the per-thread span/triangle buffers and the pixel mask
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// bit flags naming the groups of derived thread state that need recalculation
#define DPSOFTRAST_VALIDATE_FB (1 << 0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1 << 1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1 << 2)
// all of the above
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
222
// Blend modes the rasterizer distinguishes; DPSOFTRAST_RecalcBlendFunc maps
// the GL blend factor pair (and the subtract flag) onto one of these.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
	DPSOFTRAST_BLENDMODE_ALPHA       = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
	DPSOFTRAST_BLENDMODE_ADD         = 3,
	DPSOFTRAST_BLENDMODE_INVMOD      = 4,
	DPSOFTRAST_BLENDMODE_MUL         = 5,
	DPSOFTRAST_BLENDMODE_MUL2        = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD      = 9,
	DPSOFTRAST_BLENDMODE_TOTAL       = 10
}
DPSOFTRAST_BLENDMODE;
238
239 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
240 {
241         void *thread;
242         int index;
243         
244         int cullface;
245         int colormask[4];
246         int blendfunc[2];
247         int blendsubtract;
248         int depthmask;
249         int depthtest;
250         int depthfunc;
251         int scissortest;
252         int viewport[4];
253         int scissor[4];
254         float depthrange[2];
255         float polygonoffset[2];
256         float clipplane[4];
257         ALIGN(float fb_clipplane[4]);
258
259         int shader_mode;
260         int shader_permutation;
261         int shader_exactspecularmath;
262
263         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
264         
265         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
266         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
267
268         // DPSOFTRAST_VALIDATE_ flags
269         int validate;
270
271         // derived values (DPSOFTRAST_VALIDATE_FB)
272         int fb_colormask;
273         int fb_scissor[4];
274         ALIGN(float fb_viewportcenter[4]);
275         ALIGN(float fb_viewportscale[4]);
276
277         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
278         int fb_depthfunc;
279
280         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
281         int fb_blendmode;
282
283         // band boundaries
284         int miny1;
285         int maxy1;
286         int miny2;
287         int maxy2;
288
289         ATOMIC(volatile int commandoffset);
290
291         volatile bool waiting;
292         volatile bool starving;
293         void *waitcond;
294         void *drawcond;
295         void *drawmutex;
296
297         int numspans;
298         int numtriangles;
299         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
300         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
301         unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
302 }
303 DPSOFTRAST_State_Thread);
304
305 typedef ALIGN(struct DPSOFTRAST_State_s
306 {
307         int fb_width;
308         int fb_height;
309         unsigned int *fb_depthpixels;
310         unsigned int *fb_colorpixels[4];
311
312         int viewport[4];
313         ALIGN(float fb_viewportcenter[4]);
314         ALIGN(float fb_viewportscale[4]);
315
316         float color[4];
317         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
318         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
319
320         const float *pointer_vertex3f;
321         const float *pointer_color4f;
322         const unsigned char *pointer_color4ub;
323         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
324         int stride_vertex;
325         int stride_color;
326         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
327         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
328         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
329
330         int firstvertex;
331         int numvertices;
332         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
333         float *screencoord4f;
334         int drawstarty;
335         int drawendy;
336         int drawclipped;
337         
338         int shader_mode;
339         int shader_permutation;
340         int shader_exactspecularmath;
341
342         int texture_max;
343         int texture_end;
344         int texture_firstfree;
345         DPSOFTRAST_Texture *texture;
346
347         int bigendian;
348
349         // error reporting
350         const char *errorstring;
351
352         bool usethreads;
353         int interlace;
354         int numthreads;
355         DPSOFTRAST_State_Thread *threads;
356
357         ATOMIC(volatile int drawcommand);
358
359         DPSOFTRAST_State_Command_Pool commandpool;
360 }
361 DPSOFTRAST_State);
362
363 DPSOFTRAST_State dpsoftrast;
364
// depth buffer values are (1 - depth) scaled by 2^30
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one int with blue in bits
// 0-7, green in bits 8-15, red in bits 16-23 and alpha in bits 24-31; each
// channel is scaled by 255 and rounded to nearest.
// Every argument is parenthesized so that expressions such as a+b convert
// correctly, and the bytes are combined as unsigned values so that placing
// alpha in the top byte never shifts into the sign bit of a signed int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// converts a float depth d to the integer depth buffer representation
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
369
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
372
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
374 {
375         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377         fb_viewportcenter[3] = 0.5f;
378         fb_viewportcenter[0] = 0.0f;
379         fb_viewportscale[1] = 0.5f * viewport[2];
380         fb_viewportscale[2] = -0.5f * viewport[3];
381         fb_viewportscale[3] = 0.5f;
382         fb_viewportscale[0] = 1.0f;
383 }
384
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
386 {
387         if (dpsoftrast.interlace)
388         {
389                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
393         }
394         else
395         {
396                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
398         }
399 }
400
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
402 {
403         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
408 }
409
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
411 {
412         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413         // and viewport projection values
414         int x1, x2;
415         int y1, y2;
416         x1 = thread->scissor[0];
417         x2 = thread->scissor[0] + thread->scissor[2];
418         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419         y2 = dpsoftrast.fb_height - thread->scissor[1];
420         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
421         if (x1 < 0) x1 = 0;
422         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
423         if (y1 < 0) y1 = 0;
424         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
425         thread->fb_scissor[0] = x1;
426         thread->fb_scissor[1] = y1;
427         thread->fb_scissor[2] = x2 - x1;
428         thread->fb_scissor[3] = y2 - y1;
429
430         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431         DPSOFTRAST_RecalcClipPlane(thread);
432         DPSOFTRAST_RecalcThread(thread);
433 }
434
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
436 {
437         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
438 }
439
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
441 {
442         if (thread->blendsubtract)
443         {
444                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
445                 {
446                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
450                 }
451         }
452         else
453         {       
454                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
455                 {
456                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
461                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
467                 }
468         }
469 }
470
// Cheap inline pre-check for DPSOFTRAST_Validate: only makes the call when at
// least one of the flags in f is still pending on the thread. The expression
// always evaluates to 0. thread may be any pointer expression, but it is
// evaluated more than once, so it must be free of side effects.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
472
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
474 {
475         mask &= thread->validate;
476         if (!mask)
477                 return;
478         if (mask & DPSOFTRAST_VALIDATE_FB)
479         {
480                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481                 DPSOFTRAST_RecalcFB(thread);
482         }
483         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
484         {
485                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486                 DPSOFTRAST_RecalcDepthFunc(thread);
487         }
488         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
489         {
490                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491                 DPSOFTRAST_RecalcBlendFunc(thread);
492         }
493 }
494
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
496 {
497         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498                 return &dpsoftrast.texture[index];
499         return NULL;
500 }
501
502 static void DPSOFTRAST_Texture_Grow(void)
503 {
504         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505         DPSOFTRAST_State_Thread *thread;
506         int i;
507         int j;
508         DPSOFTRAST_Flush();
509         // expand texture array as needed
510         if (dpsoftrast.texture_max < 1024)
511                 dpsoftrast.texture_max = 1024;
512         else
513                 dpsoftrast.texture_max *= 2;
514         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
515         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516                 if (dpsoftrast.texbound[i])
517                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
518         for (j = 0; j < dpsoftrast.numthreads; j++)
519         {
520                 thread = &dpsoftrast.threads[j];
521                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522                         if (thread->texbound[i])
523                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
524         }
525 }
526
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
528 {
529         int w;
530         int h;
531         int d;
532         int size;
533         int s;
534         int texnum;
535         int mipmaps;
536         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538         DPSOFTRAST_Texture *texture;
539         if (width*height*depth < 1)
540         {
541                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
542                 return 0;
543         }
544         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
545         {
546                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
547                 return 0;
548         }
549         switch(texformat)
550         {
551         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
554                 break;
555         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
557                 {
558                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
559                         return 0;
560                 }
561                 if (depth != 1)
562                 {
563                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
564                         return 0;
565                 }
566                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
567                 {
568                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
569                         return 0;
570                 }
571                 break;
572         }
573         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
574         {
575                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
576                 return 0;
577         }
578         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
579         {
580                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
581                 return 0;
582         }
583         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
584         {
585                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
586                 return 0;
587         }
588         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
589         {
590                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
591                 return 0;
592         }
593         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
594         {
595                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
596                 return 0;
597         }
598         // find first empty slot in texture array
599         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600                 if (!dpsoftrast.texture[texnum].bytes)
601                         break;
602         dpsoftrast.texture_firstfree = texnum + 1;
603         if (dpsoftrast.texture_max <= texnum)
604                 DPSOFTRAST_Texture_Grow();
605         if (dpsoftrast.texture_end <= texnum)
606                 dpsoftrast.texture_end = texnum + 1;
607         texture = &dpsoftrast.texture[texnum];
608         memset(texture, 0, sizeof(*texture));
609         texture->flags = flags;
610         texture->width = width;
611         texture->height = height;
612         texture->depth = depth;
613         texture->sides = sides;
614         texture->binds = 0;
615         w = width;
616         h = height;
617         d = depth;
618         size = 0;
619         mipmaps = 0;
620         w = width;
621         h = height;
622         d = depth;
623         for (;;)
624         {
625                 s = w * h * d * sides * 4;
626                 texture->mipmap[mipmaps][0] = size;
627                 texture->mipmap[mipmaps][1] = s;
628                 texture->mipmap[mipmaps][2] = w;
629                 texture->mipmap[mipmaps][3] = h;
630                 texture->mipmap[mipmaps][4] = d;
631                 size += s;
632                 mipmaps++;
633                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
634                         break;
635                 if (w > 1) w >>= 1;
636                 if (h > 1) h >>= 1;
637                 if (d > 1) d >>= 1;
638         }
639         texture->mipmaps = mipmaps;
640         texture->size = size;
641
642         // allocate the pixels now
643         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
644
645         return texnum;
646 }
647 void DPSOFTRAST_Texture_Free(int index)
648 {
649         DPSOFTRAST_Texture *texture;
650         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
651         if (texture->binds)
652                 DPSOFTRAST_Flush();
653         if (texture->bytes)
654                 MM_FREE(texture->bytes);
655         texture->bytes = NULL;
656         memset(texture, 0, sizeof(*texture));
657         // adjust the free range and used range
658         if (dpsoftrast.texture_firstfree > index)
659                 dpsoftrast.texture_firstfree = index;
660         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
661                 dpsoftrast.texture_end--;
662 }
663 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
664 {
665         int i, x, y, z, w, layer0, layer1, row0, row1;
666         unsigned char *o, *i0, *i1, *i2, *i3;
667         DPSOFTRAST_Texture *texture;
668         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
669         if (texture->mipmaps <= 1)
670                 return;
671         for (i = 1;i < texture->mipmaps;i++)
672         {
673                 for (z = 0;z < texture->mipmap[i][4];z++)
674                 {
675                         layer0 = z*2;
676                         layer1 = z*2+1;
677                         if (layer1 >= texture->mipmap[i-1][4])
678                                 layer1 = texture->mipmap[i-1][4]-1;
679                         for (y = 0;y < texture->mipmap[i][3];y++)
680                         {
681                                 row0 = y*2;
682                                 row1 = y*2+1;
683                                 if (row1 >= texture->mipmap[i-1][3])
684                                         row1 = texture->mipmap[i-1][3]-1;
685                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
686                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
687                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
688                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
689                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
690                                 w = texture->mipmap[i][2];
691                                 if (layer1 > layer0)
692                                 {
693                                         if (texture->mipmap[i-1][2] > 1)
694                                         {
695                                                 // average 3D texture
696                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
697                                                 {
698                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
699                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
700                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
701                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
702                                                 }
703                                         }
704                                         else
705                                         {
706                                                 // average 3D mipmap with parent width == 1
707                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
708                                                 {
709                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
710                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
711                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
712                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
713                                                 }
714                                         }
715                                 }
716                                 else
717                                 {
718                                         if (texture->mipmap[i-1][2] > 1)
719                                         {
720                                                 // average 2D texture (common case)
721                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
722                                                 {
723                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
724                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
725                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
726                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
727                                                 }
728                                         }
729                                         else
730                                         {
731                                                 // 2D texture with parent width == 1
732                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
733                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
734                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
735                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
736                                         }
737                                 }
738                         }
739                 }
740         }
741 }
742 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
743 {
744         DPSOFTRAST_Texture *texture;
745         unsigned char *dst;
746         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
747         if (texture->binds)
748                 DPSOFTRAST_Flush();
749         if (pixels)
750         {
751                 dst = texture->bytes + texture->mipmap[0][1] +(-blocky * texture->mipmap[0][2] + blockx) * 4;
752                 while (blockheight > 0)
753                 {
754                         dst -= texture->mipmap[0][2] * 4;
755                         memcpy(dst, pixels, blockwidth * 4);
756                         pixels += blockwidth * 4;
757                         blockheight--;
758                 }
759         }
760         DPSOFTRAST_Texture_CalculateMipmaps(index);
761 }
762 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
763 {
764         DPSOFTRAST_Texture *texture;
765         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
766         if (texture->binds)
767                 DPSOFTRAST_Flush();
768         if (pixels)
769         {
770                 int i, stride = texture->mipmap[0][2]*4;
771                 unsigned char *dst = texture->bytes + texture->mipmap[0][1];
772                 for (i = texture->mipmap[0][3];i > 0;i--)
773                 {
774                         dst -= stride;
775                         memcpy(dst, pixels, stride);
776                         pixels += stride;
777                 }
778         }
779         DPSOFTRAST_Texture_CalculateMipmaps(index);
780 }
781 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
782 {
783         DPSOFTRAST_Texture *texture;
784         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
785         return texture->mipmap[mip][2];
786 }
787 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
788 {
789         DPSOFTRAST_Texture *texture;
790         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
791         return texture->mipmap[mip][3];
792 }
793 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
794 {
795         DPSOFTRAST_Texture *texture;
796         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
797         return texture->mipmap[mip][4];
798 }
799 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
800 {
801         DPSOFTRAST_Texture *texture;
802         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
803         if (texture->binds)
804                 DPSOFTRAST_Flush();
805         return texture->bytes + texture->mipmap[mip][0];
806 }
807 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
808 {
809         DPSOFTRAST_Texture *texture;
810         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
811         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
812         {
813                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
814                 return;
815         }
816         if (texture->binds)
817                 DPSOFTRAST_Flush();
818         texture->filter = filter;
819 }
820
821 static void DPSOFTRAST_Draw_FlushThreads(void);
822
823 static void DPSOFTRAST_Draw_SyncCommands(void)
824 {
825         if(dpsoftrast.usethreads) MEMORY_BARRIER;
826         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
827 }
828
829 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
830 {
831         DPSOFTRAST_State_Thread *thread;
832         int i;
833         int freecommand = dpsoftrast.commandpool.freecommand;
834         int usedcommands = dpsoftrast.commandpool.usedcommands;
835         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
836                 return;
837         DPSOFTRAST_Draw_SyncCommands();
838         for(;;)
839         {
840                 int waitindex = -1;
841                 int commandoffset;
842                 usedcommands = 0;
843                 for (i = 0; i < dpsoftrast.numthreads; i++)
844                 {
845                         thread = &dpsoftrast.threads[i]; 
846                         commandoffset = freecommand - thread->commandoffset;
847                         if (commandoffset < 0)
848                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
849                         if (commandoffset > usedcommands)
850                         {
851                                 waitindex = i;
852                                 usedcommands = commandoffset;
853                         }
854                 }
855                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
856                         break;
857                 thread = &dpsoftrast.threads[waitindex];
858                 Thread_LockMutex(thread->drawmutex);
859                 if (thread->commandoffset != dpsoftrast.drawcommand)
860                 {
861                         thread->waiting = true;
862                         if (thread->starving) Thread_CondSignal(thread->drawcond);
863                         Thread_CondWait(thread->waitcond, thread->drawmutex);
864                         thread->waiting = false;
865                 }
866                 Thread_UnlockMutex(thread->drawmutex);
867         }
868         dpsoftrast.commandpool.usedcommands = usedcommands;
869 }
870
871 #define DPSOFTRAST_ALIGNCOMMAND(size) \
872         ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
873 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
874         ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
875
876 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
877 {
878         DPSOFTRAST_Command *command;
879         int freecommand = dpsoftrast.commandpool.freecommand;
880         int usedcommands = dpsoftrast.commandpool.usedcommands;
881         int extra = sizeof(DPSOFTRAST_Command);
882         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
883                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
884         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
885         {
886                 if (dpsoftrast.usethreads)
887                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
888                 else
889                         DPSOFTRAST_Draw_FlushThreads();
890                 freecommand = dpsoftrast.commandpool.freecommand;
891                 usedcommands = dpsoftrast.commandpool.usedcommands;
892         }
893         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
894         {
895                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
896                 command->opcode = DPSOFTRAST_OPCODE_Reset;
897                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
898                 freecommand = 0;
899         }
900         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
901         command->opcode = opcode;
902         command->commandsize = size;
903         freecommand += size;
904         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
905                 freecommand = 0;
906         dpsoftrast.commandpool.freecommand = freecommand;
907         dpsoftrast.commandpool.usedcommands = usedcommands + size;
908         return command;
909 }
910
911 static void DPSOFTRAST_UndoCommand(int size)
912 {
913         int freecommand = dpsoftrast.commandpool.freecommand;
914         int usedcommands = dpsoftrast.commandpool.usedcommands;
915         freecommand -= size;
916         if (freecommand < 0)
917                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
918         usedcommands -= size;
919         dpsoftrast.commandpool.freecommand = freecommand;
920         dpsoftrast.commandpool.usedcommands = usedcommands;
921 }
922                 
923 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
924 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
925 {
926         thread->viewport[0] = command->x;
927         thread->viewport[1] = command->y;
928         thread->viewport[2] = command->width;
929         thread->viewport[3] = command->height;
930         thread->validate |= DPSOFTRAST_VALIDATE_FB;
931 }
932 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
933 {
934         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
935         command->x = x;
936         command->y = y;
937         command->width = width;
938         command->height = height;
939
940         dpsoftrast.viewport[0] = x;
941         dpsoftrast.viewport[1] = y;
942         dpsoftrast.viewport[2] = width;
943         dpsoftrast.viewport[3] = height;
944         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
945 }
946
947 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
948 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
949 {
950         int i, x1, y1, x2, y2, w, h, x, y;
951         int miny1, maxy1, miny2, maxy2;
952         int bandy;
953         unsigned int *p;
954         unsigned int c;
955         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
956         miny1 = thread->miny1;
957         maxy1 = thread->maxy1;
958         miny2 = thread->miny2;
959         maxy2 = thread->maxy2;
960         x1 = thread->fb_scissor[0];
961         y1 = thread->fb_scissor[1];
962         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
963         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
964         if (y1 < miny1) y1 = miny1;
965         if (y2 > maxy2) y2 = maxy2;
966         w = x2 - x1;
967         h = y2 - y1;
968         if (w < 1 || h < 1)
969                 return;
970         // FIXME: honor fb_colormask?
971         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
972         for (i = 0;i < 4;i++)
973         {
974                 if (!dpsoftrast.fb_colorpixels[i])
975                         continue;
976                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
977                 for (;y < bandy;y++)
978                 {
979                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
980                         for (x = x1;x < x2;x++)
981                                 p[x] = c;
982                 }
983         }
984 }
985 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
986 {
987         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
988         command->r = r;
989         command->g = g;
990         command->b = b;
991         command->a = a;
992 }
993
994 DEFCOMMAND(3, ClearDepth, float depth;)
995 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
996 {
997         int x1, y1, x2, y2, w, h, x, y;
998         int miny1, maxy1, miny2, maxy2;
999         int bandy;
1000         unsigned int *p;
1001         unsigned int c;
1002         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
1003         miny1 = thread->miny1;
1004         maxy1 = thread->maxy1;
1005         miny2 = thread->miny2;
1006         maxy2 = thread->maxy2;
1007         x1 = thread->fb_scissor[0];
1008         y1 = thread->fb_scissor[1];
1009         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1010         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
1011         if (y1 < miny1) y1 = miny1;
1012         if (y2 > maxy2) y2 = maxy2;
1013         w = x2 - x1;
1014         h = y2 - y1;
1015         if (w < 1 || h < 1)
1016                 return;
1017         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
1018         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1019         for (;y < bandy;y++)
1020         {
1021                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1022                 for (x = x1;x < x2;x++)
1023                         p[x] = c;
1024         }
1025 }
1026 void DPSOFTRAST_ClearDepth(float d)
1027 {
1028         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1029         command->depth = d;
1030 }
1031
1032 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1033 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1034 {
1035         thread->colormask[0] = command->r != 0;
1036         thread->colormask[1] = command->g != 0;
1037         thread->colormask[2] = command->b != 0;
1038         thread->colormask[3] = command->a != 0;
1039         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1040 }
1041 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1042 {
1043         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1044         command->r = r;
1045         command->g = g;
1046         command->b = b;
1047         command->a = a;
1048 }
1049
1050 DEFCOMMAND(5, DepthTest, int enable;)
1051 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1052 {
1053         thread->depthtest = command->enable;
1054         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1055 }
1056 void DPSOFTRAST_DepthTest(int enable)
1057 {
1058         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1059         command->enable = enable;
1060 }
1061
1062 DEFCOMMAND(6, ScissorTest, int enable;)
1063 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1064 {
1065         thread->scissortest = command->enable;
1066         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1067 }
1068 void DPSOFTRAST_ScissorTest(int enable)
1069 {
1070         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1071         command->enable = enable;
1072 }
1073
1074 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1075 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1076 {
1077         thread->scissor[0] = command->x;
1078         thread->scissor[1] = command->y;
1079         thread->scissor[2] = command->width;
1080         thread->scissor[3] = command->height;
1081         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1082 }
1083 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1084 {
1085         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1086         command->x = x;
1087         command->y = y;
1088         command->width = width;
1089         command->height = height;
1090 }
1091
1092 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1093 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1094 {
1095         thread->blendfunc[0] = command->sfactor;
1096         thread->blendfunc[1] = command->dfactor;
1097         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1098 }
1099 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1100 {
1101         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1102         command->sfactor = sfactor;
1103         command->dfactor = dfactor;
1104 }
1105
1106 DEFCOMMAND(9, BlendSubtract, int enable;)
1107 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1108 {
1109         thread->blendsubtract = command->enable;
1110         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1111 }
1112 void DPSOFTRAST_BlendSubtract(int enable)
1113 {
1114         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1115         command->enable = enable;
1116 }
1117
1118 DEFCOMMAND(10, DepthMask, int enable;)
1119 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1120 {
1121         thread->depthmask = command->enable;
1122 }
1123 void DPSOFTRAST_DepthMask(int enable)
1124 {
1125         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1126         command->enable = enable;
1127 }
1128
1129 DEFCOMMAND(11, DepthFunc, int func;)
1130 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1131 {
1132         thread->depthfunc = command->func;
1133 }
1134 void DPSOFTRAST_DepthFunc(int func)
1135 {
1136         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1137         command->func = func;
1138 }
1139
1140 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1141 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1142 {
1143         thread->depthrange[0] = command->nearval;
1144         thread->depthrange[1] = command->farval;
1145 }
1146 void DPSOFTRAST_DepthRange(float nearval, float farval)
1147 {
1148         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1149         command->nearval = nearval;
1150         command->farval = farval;
1151 }
1152
1153 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1154 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1155 {
1156         thread->polygonoffset[0] = command->alongnormal;
1157         thread->polygonoffset[1] = command->intoview;
1158 }
1159 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1160 {
1161         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1162         command->alongnormal = alongnormal;
1163         command->intoview = intoview;
1164 }
1165
1166 DEFCOMMAND(14, CullFace, int mode;)
1167 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1168 {
1169         thread->cullface = command->mode;
1170 }
1171 void DPSOFTRAST_CullFace(int mode)
1172 {
1173         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1174         command->mode = mode;
1175 }
1176
1177 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1178 {
1179         dpsoftrast.color[0] = r;
1180         dpsoftrast.color[1] = g;
1181         dpsoftrast.color[2] = b;
1182         dpsoftrast.color[3] = a;
1183 }
1184
1185 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1186 {
1187         int outstride = blockwidth * 4;
1188         int instride = dpsoftrast.fb_width * 4;
1189         int bx1 = blockx;
1190         int by1 = blocky;
1191         int bx2 = blockx + blockwidth;
1192         int by2 = blocky + blockheight;
1193         int bw;
1194         int x;
1195         int y;
1196         unsigned char *inpixels;
1197         unsigned char *b;
1198         unsigned char *o;
1199         DPSOFTRAST_Flush();
1200         if (bx1 < 0) bx1 = 0;
1201         if (by1 < 0) by1 = 0;
1202         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1203         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1204         bw = bx2 - bx1;
1205         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1206         if (dpsoftrast.bigendian)
1207         {
1208                 for (y = by1;y < by2;y++)
1209                 {
1210                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1212                         for (x = bx1;x < bx2;x++)
1213                         {
1214                                 o[0] = b[3];
1215                                 o[1] = b[2];
1216                                 o[2] = b[1];
1217                                 o[3] = b[0];
1218                                 o += 4;
1219                                 b += 4;
1220                         }
1221                 }
1222         }
1223         else
1224         {
1225                 for (y = by1;y < by2;y++)
1226                 {
1227                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1228                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1229                         memcpy(o, b, bw*4);
1230                 }
1231         }
1232
1233 }
1234 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1235 {
1236         int tx1 = tx;
1237         int ty1 = ty;
1238         int tx2 = tx + width;
1239         int ty2 = ty + height;
1240         int sx1 = sx;
1241         int sy1 = sy;
1242         int sx2 = sx + width;
1243         int sy2 = sy + height;
1244         int swidth;
1245         int sheight;
1246         int twidth;
1247         int theight;
1248         int sw;
1249         int sh;
1250         int tw;
1251         int th;
1252         int y;
1253         unsigned int *spixels;
1254         unsigned int *tpixels;
1255         DPSOFTRAST_Texture *texture;
1256         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1257         if (mip < 0 || mip >= texture->mipmaps) return;
1258         DPSOFTRAST_Flush();
1259         spixels = dpsoftrast.fb_colorpixels[0];
1260         swidth = dpsoftrast.fb_width;
1261         sheight = dpsoftrast.fb_height;
1262         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1263         twidth = texture->mipmap[mip][2];
1264         theight = texture->mipmap[mip][3];
1265         if (tx1 < 0) tx1 = 0;
1266         if (ty1 < 0) ty1 = 0;
1267         if (tx2 > twidth) tx2 = twidth;
1268         if (ty2 > theight) ty2 = theight;
1269         if (sx1 < 0) sx1 = 0;
1270         if (sy1 < 0) sy1 = 0;
1271         if (sx2 > swidth) sx2 = swidth;
1272         if (sy2 > sheight) sy2 = sheight;
1273         tw = tx2 - tx1;
1274         th = ty2 - ty1;
1275         sw = sx2 - sx1;
1276         sh = sy2 - sy1;
1277         if (tw > sw) tw = sw;
1278         if (th > sh) th = sh;
1279         if (tw < 1 || th < 1)
1280                 return;
1281         sy1 = sheight - sy1 - th;
1282         ty1 = theight - ty1 - th;
1283         for (y = 0;y < th;y++)
1284                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1285         if (texture->mipmaps > 1)
1286                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1287 }
1288
1289 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1290 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1291 {
1292         if (thread->texbound[command->unitnum])
1293                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1294         thread->texbound[command->unitnum] = command->texture;
1295 }
1296 void DPSOFTRAST_SetTexture(int unitnum, int index)
1297 {
1298         DPSOFTRAST_Command_SetTexture *command;
1299         DPSOFTRAST_Texture *texture;
1300         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1301         {
1302                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1303                 return;
1304         }
1305         texture = DPSOFTRAST_Texture_GetByIndex(index);
1306         if (index && !texture)
1307         {
1308                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1309                 return;
1310         }
1311
1312         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1313         command->unitnum = unitnum;
1314         command->texture = texture;
1315
1316         dpsoftrast.texbound[unitnum] = texture;
1317         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1318 }
1319
1320 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1321 {
1322         dpsoftrast.pointer_vertex3f = vertex3f;
1323         dpsoftrast.stride_vertex = stride;
1324 }
1325 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1326 {
1327         dpsoftrast.pointer_color4f = color4f;
1328         dpsoftrast.pointer_color4ub = NULL;
1329         dpsoftrast.stride_color = stride;
1330 }
1331 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1332 {
1333         dpsoftrast.pointer_color4f = NULL;
1334         dpsoftrast.pointer_color4ub = color4ub;
1335         dpsoftrast.stride_color = stride;
1336 }
1337 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1338 {
1339         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1340         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1341         dpsoftrast.stride_texcoord[unitnum] = stride;
1342 }
1343
1344 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1345 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1346 {
1347         thread->shader_mode = command->mode;
1348         thread->shader_permutation = command->permutation;
1349         thread->shader_exactspecularmath = command->exactspecularmath;
1350 }
1351 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1352 {
1353         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1354         command->mode = mode;
1355         command->permutation = permutation;
1356         command->exactspecularmath = exactspecularmath;
1357
1358         dpsoftrast.shader_mode = mode;
1359         dpsoftrast.shader_permutation = permutation;
1360         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1361 }
1362
1363 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1364 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1365 {
1366         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1367 }
1368 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1369 {
1370         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1371         command->index = index;
1372         command->val[0] = v0;
1373         command->val[1] = v1;
1374         command->val[2] = v2;
1375         command->val[3] = v3;
1376
1377         dpsoftrast.uniform4f[index*4+0] = v0;
1378         dpsoftrast.uniform4f[index*4+1] = v1;
1379         dpsoftrast.uniform4f[index*4+2] = v2;
1380         dpsoftrast.uniform4f[index*4+3] = v3;
1381 }
1382 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1383 {
1384         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1385         command->index = index;
1386         memcpy(command->val, v, sizeof(command->val));
1387
1388         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1389 }
1390
1391 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1392 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1393 {
1394         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1395 }
1396 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1397 {
1398 #ifdef SSE_POSSIBLE
1399         int i, index;
1400         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1401         {
1402                 __m128 m0, m1, m2, m3;
1403                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1404                 command->index = (DPSOFTRAST_UNIFORM)index;
1405                 if (((size_t)v)&(ALIGN_SIZE-1))
1406                 {
1407                         m0 = _mm_loadu_ps(v);
1408                         m1 = _mm_loadu_ps(v+4);
1409                         m2 = _mm_loadu_ps(v+8);
1410                         m3 = _mm_loadu_ps(v+12);
1411                 }
1412                 else
1413                 {
1414                         m0 = _mm_load_ps(v);
1415                         m1 = _mm_load_ps(v+4);
1416                         m2 = _mm_load_ps(v+8);
1417                         m3 = _mm_load_ps(v+12);
1418                 }
1419                 if (transpose)
1420                 {
1421                         __m128 t0, t1, t2, t3;
1422                         t0 = _mm_unpacklo_ps(m0, m1);
1423                         t1 = _mm_unpacklo_ps(m2, m3);
1424                         t2 = _mm_unpackhi_ps(m0, m1);
1425                         t3 = _mm_unpackhi_ps(m2, m3);
1426                         m0 = _mm_movelh_ps(t0, t1);
1427                         m1 = _mm_movehl_ps(t1, t0);
1428                         m2 = _mm_movelh_ps(t2, t3);
1429                         m3 = _mm_movehl_ps(t3, t2);                     
1430                 }
1431                 _mm_store_ps(command->val, m0);
1432                 _mm_store_ps(command->val+4, m1);
1433                 _mm_store_ps(command->val+8, m2);
1434                 _mm_store_ps(command->val+12, m3);
1435                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1436                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1437                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1438                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1439         }
1440 #endif
1441 }
1442
1443 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1444 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1445 {
1446         thread->uniform1i[command->index] = command->val;
1447 }
1448 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1449 {
1450         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1451         command->index = index;
1452         command->val = i0;
1453
1454         dpsoftrast.uniform1i[command->index] = i0;
1455 }
1456
1457 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1458 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1459 {
1460         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1461         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1462 }
1463 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1464 {
1465         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1466         command->clipplane[0] = x;
1467         command->clipplane[1] = y;
1468         command->clipplane[2] = z;
1469         command->clipplane[3] = w;
1470 }
1471
1472 #ifdef SSE_POSSIBLE
1473 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1474 {
1475         float *end = dst + size*4;
1476         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1477         {
1478                 while (dst < end)
1479                 {
1480                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1481                         dst += 4;
1482                         src += stride;
1483                 }
1484         }
1485         else
1486         {
1487                 while (dst < end)
1488                 {
1489                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1490                         dst += 4;
1491                         src += stride;
1492                 }
1493         }
1494 }
1495
1496 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1497 {
1498         float *end = dst + size*4;
1499         if (stride == sizeof(float[3]))
1500         {
1501                 float *end4 = dst + (size&~3)*4;        
1502                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1503                 {
1504                         while (dst < end4)
1505                         {
1506                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1507                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1508                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1509                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1510                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1511                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1512                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1513                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1514                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1515                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1516                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1517                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1518                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1519                                 dst += 16;
1520                                 src += 4*sizeof(float[3]);
1521                         }
1522                 }
1523                 else
1524                 {
1525                         while (dst < end4)
1526                         {
1527                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1528                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1529                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1530                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1531                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1532                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1533                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1534                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1535                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1536                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1537                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1538                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1539                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1540                                 dst += 16;
1541                                 src += 4*sizeof(float[3]);
1542                         }
1543                 }
1544         }
1545         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1546         {
1547                 while (dst < end)
1548                 {
1549                         __m128 v = _mm_loadu_ps((const float *)src);
1550                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1551                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1552                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1553                         _mm_store_ps(dst, v);
1554                         dst += 4;
1555                         src += stride;
1556                 }
1557         }
1558         else
1559         {
1560                 while (dst < end)
1561                 {
1562                         __m128 v = _mm_load_ps((const float *)src);
1563                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1564                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1565                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1566                         _mm_store_ps(dst, v);
1567                         dst += 4;
1568                         src += stride;
1569                 }
1570         }
1571 }
1572
1573 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1574 {
1575         float *end = dst + size*4;
1576         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1577         if (stride == sizeof(float[2]))
1578         {
1579                 float *end2 = dst + (size&~1)*4;
1580                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1581                 {
1582                         while (dst < end2)
1583                         {
1584                                 __m128 v = _mm_loadu_ps((const float *)src);
1585                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1586                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1587                                 dst += 8;
1588                                 src += 2*sizeof(float[2]);
1589                         }
1590                 }
1591                 else
1592                 {
1593                         while (dst < end2)
1594                         {
1595                                 __m128 v = _mm_load_ps((const float *)src);
1596                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1597                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1598                                 dst += 8;
1599                                 src += 2*sizeof(float[2]);
1600                         }
1601                 }
1602         }
1603         while (dst < end)
1604         {
1605                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1606                 dst += 4;
1607                 src += stride;
1608         }
1609 }
1610
1611 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1612 {
1613         float *end = dst + size*4;
1614         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1615         if (stride == sizeof(unsigned char[4]))
1616         {
1617                 float *end4 = dst + (size&~3)*4;
1618                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1619                 {
1620                         while (dst < end4)
1621                         {
1622                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1623                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1624                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1625                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1626                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1627                                 dst += 16;
1628                                 src += 4*sizeof(unsigned char[4]);
1629                         }
1630                 }
1631                 else
1632                 {
1633                         while (dst < end4)
1634                         {
1635                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1636                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1637                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1638                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1639                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1640                                 dst += 16;
1641                                 src += 4*sizeof(unsigned char[4]);
1642                         }
1643                 }
1644         }
1645         while (dst < end)
1646         {
1647                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1648                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1649                 dst += 4;
1650                 src += stride;
1651         }
1652 }
1653
// Writes 'size' copies of the four floats at 'src' to 'dst'.
//   dst  - destination, written with aligned stores (must be 16-byte aligned)
//   src  - the four floats to replicate; read once, no alignment required
//   size - number of copies to write
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	float *stop = dst + 4*size;
	for (; dst < stop; dst += 4)
		_mm_store_ps(dst, value);
}
1664 #endif
1665
// Transforms 'numitems' four-float vectors by a 4x4 matrix.
//   out4f       - destination, written with aligned stores; may equal in4f
//   in4f        - source vectors
//   numitems    - number of vectors
//   inmatrix16f - 16 floats, read as four groups of four (m0..m3)
// Each output is x*m0 + y*m1 + z*m2 + w*m3. When the matrix is bit-for-bit the
// identity, the input is simply copied (or left alone when out4f == in4f).
// Without SSE_POSSIBLE the function does nothing.
static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 m0, m1, m2, m3;
	float *stop;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	m0 = _mm_loadu_ps(inmatrix16f);
	m1 = _mm_loadu_ps(inmatrix16f + 4);
	m2 = _mm_loadu_ps(inmatrix16f + 8);
	m3 = _mm_loadu_ps(inmatrix16f + 12);
	stop = out4f + numitems*4;
	if (((size_t)in4f)&(ALIGN_SIZE-1))
	{
		// source is not 16-byte aligned
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			__m128 v = _mm_loadu_ps(in4f);
			__m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0);
			__m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1);
			__m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2);
			__m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
	else
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			__m128 v = _mm_load_ps(in4f);
			__m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0);
			__m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1);
			__m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2);
			__m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
#endif
}
1713
1714 #if 0
// Compiled out by the surrounding #if 0.
// Copies numitems four-float vertices from in4f to out4f with memcpy, so the
// two ranges must not overlap.
1715 static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1716 {
1717         memcpy(out4f, in4f, numitems * sizeof(float[4]));
1718 }
1719 #endif
1720
1721 #ifdef SSE_POSSIBLE
// DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale)
// Perspective-divides the __m128 vertex 'in' = (x, y, z, w) and maps it through
// the viewport. Internally the vertex is rotated to (1, x, y, z), multiplied by
// viewportscale, divided by w, offset by viewportcenter and rotated back, so
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// where cN/sN are lanes of viewportcenter/viewportscale.
// 'in' is evaluated once, so 'out' may name the same variable as 'in'.
// The macro declares block-local variables named p and w; arguments must not
// use those names.
1722 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1723 { \
1724         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1725         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1726         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1727         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1728 }
1729
// DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale)
// NOTE(review): the body is identical to DPSOFTRAST_PROJECTVERTEX above; no
// use of this macro is visible in this part of the file.
1730 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1731 { \
1732         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1733         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1734         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1735         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1736 }
1737
// DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3)
// Computes out = x*m0 + y*m1 + z*m2 + w*m3 for the __m128 vertex
// 'in' = (x, y, z, w) and the four __m128 matrix vectors m0..m3.
// 'in' is evaluated once, so 'out' may name the same variable as 'in'.
// The macro declares a block-local variable named p; arguments must not use
// that name.
1738 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1739 { \
1740         __m128 p = (in); \
1741         out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1742                                                   _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1743                                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1744                                                                                         _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
1745 }
1746
// Estimates the range of screen rows covered by the axis-aligned box
// [minposf, maxposf] once it has been transformed by inmatrix16f, and reports
// which of the box's eight corners fail the z + w >= 0 test.
//   starty, endy     - outputs: *starty is the truncated screen y derived from
//                      the largest projected y/w, *endy the truncated screen y
//                      derived from the smallest projected y/w, plus one
//   minposf, maxposf - box extremes; read with aligned loads, so they must be
//                      16-byte aligned
//   inmatrix16f      - 16 floats, read with unaligned loads
// Returns the clip mask: bit k is set when corner k has z + w < 0.
// NOTE(review): starty coming from the maximum and endy from the minimum
// presumably relies on the y lane of fb_viewportscale being negative - confirm.
1747 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1748 {
1749         int clipmask = 0xFF;
1750         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
        // minproj/maxproj start inverted (2 and -2) so the first real sample replaces them
1751         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1752         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1753         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
        // swap lanes 0 and 1 of every matrix vector so transformed vertices come
        // out as (y, x, z, w); this puts y in lane 0 where the scalar (_ss)
        // instructions below operate
1754         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1755         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1756         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1757         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
        // BBFRONT(k, pos): transform corner k into bb[k] and store z + w in
        // lane 0 of clipdist[k]. If z + w >= 0, clear bit k of clipmask and
        // fold y/w into minproj/maxproj.
1758         #define BBFRONT(k, pos) \
1759         { \
1760                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1761                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1762                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1763                 { \
1764                         __m128 proj; \
1765                         clipmask &= ~(1<<k); \
1766                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1767                         minproj = _mm_min_ss(minproj, proj); \
1768                         maxproj = _mm_max_ss(maxproj, proj); \
1769                 } \
1770         }
        // the eight corners: bit 0 of the corner index selects max x, bit 1
        // max y, bit 2 max z (lane 3 follows the z choice)
1771         BBFRONT(0, minpos); 
1772         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1773         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1774         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1775         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1776         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1777         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1778         BBFRONT(7, maxpos);
        // BBCLIP(k): for a corner that failed the test, visit its three edge
        // neighbours (k^1, k^2, k^4). For each neighbour that passed, find the
        // point on the edge where z + w reaches zero (frac = d_k / (d_k - d_n)),
        // and fold that point's y/w into minproj/maxproj.
1779         #define BBCLIP(k) \
1780         { \
1781                 if (clipmask&(1<<k)) \
1782                 { \
1783                         if (!(clipmask&(1<<(k^1)))) \
1784                         { \
1785                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1786                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1787                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1788                                 minproj = _mm_min_ss(minproj, proj); \
1789                                 maxproj = _mm_max_ss(maxproj, proj); \
1790                         } \
1791                         if (!(clipmask&(1<<(k^2)))) \
1792                         { \
1793                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1794                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1795                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1796                                 minproj = _mm_min_ss(minproj, proj); \
1797                                 maxproj = _mm_max_ss(maxproj, proj); \
1798                         } \
1799                         if (!(clipmask&(1<<(k^4)))) \
1800                         { \
1801                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1802                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1803                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1804                                 minproj = _mm_min_ss(minproj, proj); \
1805                                 maxproj = _mm_max_ss(maxproj, proj); \
1806                         } \
1807                 } \
1808         }
1809         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
        // bring element 2 of the viewport vectors (the lane
        // DPSOFTRAST_PROJECTVERTEX applies to y) into lane 0
1810         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1811         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
        // clamp the projected range to [-2, 2] before converting to screen rows
1812         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1813         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1814         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1815         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
        // truncating conversions
1816         *starty = _mm_cvttss_si32(maxproj);
1817         *endy = _mm_cvttss_si32(minproj)+1;
1818         return clipmask;
1819 }
1820         
1821 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1822 {
1823         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1824         float *end = out4f + numitems*4;
1825         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1826         __m128 minpos, maxpos;
1827         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1828         {
1829                 minpos = maxpos = _mm_loadu_ps(in4f);
1830                 while (out4f < end)
1831                 {
1832                         __m128 v = _mm_loadu_ps(in4f);
1833                         minpos = _mm_min_ps(minpos, v);
1834                         maxpos = _mm_max_ps(maxpos, v);
1835                         _mm_store_ps(out4f, v);
1836                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1837                         _mm_store_ps(screen4f, v);
1838                         in4f += 4;
1839                         out4f += 4;
1840                         screen4f += 4;
1841                 }
1842         }
1843         else
1844         {
1845                 minpos = maxpos = _mm_load_ps(in4f);
1846                 while (out4f < end)
1847                 {
1848                         __m128 v = _mm_load_ps(in4f);
1849                         minpos = _mm_min_ps(minpos, v);
1850                         maxpos = _mm_max_ps(maxpos, v);
1851                         _mm_store_ps(out4f, v);
1852                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1853                         _mm_store_ps(screen4f, v);
1854                         in4f += 4;
1855                         out4f += 4;
1856                         screen4f += 4;
1857                 }
1858         }
1859         if (starty && endy) 
1860         {
1861                 ALIGN(float minposf[4]);
1862                 ALIGN(float maxposf[4]);
1863                 _mm_store_ps(minposf, minpos);
1864                 _mm_store_ps(maxposf, maxpos);
1865                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1866         }
1867         return 0;
1868 }
1869
1870 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1871 {
1872         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1873         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1874         float *end;
1875         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1876                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1877         end = out4f + numitems*4;
1878         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1879         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1880         m0 = _mm_loadu_ps(inmatrix16f);
1881         m1 = _mm_loadu_ps(inmatrix16f + 4);
1882         m2 = _mm_loadu_ps(inmatrix16f + 8);
1883         m3 = _mm_loadu_ps(inmatrix16f + 12);
1884         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1885         {
1886                 minpos = maxpos = _mm_loadu_ps(in4f);
1887                 while (out4f < end)
1888                 {
1889                         __m128 v = _mm_loadu_ps(in4f);
1890                         minpos = _mm_min_ps(minpos, v);
1891                         maxpos = _mm_max_ps(maxpos, v);
1892                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1893                         _mm_store_ps(out4f, v);
1894                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1895                         _mm_store_ps(screen4f, v);
1896                         in4f += 4;
1897                         out4f += 4;
1898                         screen4f += 4;
1899                 }
1900         }
1901         else
1902         {
1903                 minpos = maxpos = _mm_load_ps(in4f);
1904                 while (out4f < end)
1905                 {
1906                         __m128 v = _mm_load_ps(in4f);
1907                         minpos = _mm_min_ps(minpos, v);
1908                         maxpos = _mm_max_ps(maxpos, v);
1909                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1910                         _mm_store_ps(out4f, v);
1911                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1912                         _mm_store_ps(screen4f, v);
1913                         in4f += 4;
1914                         out4f += 4;
1915                         screen4f += 4;
1916                 }
1917         }
1918         if (starty && endy) 
1919         {
1920                 ALIGN(float minposf[4]);
1921                 ALIGN(float maxposf[4]);
1922                 _mm_store_ps(minposf, minpos);
1923                 _mm_store_ps(maxposf, maxpos);
1924                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1925         }
1926         return 0;
1927 }
1928 #endif
1929
// Converts one of the currently bound vertex arrays into four-float vectors
// stored in dpsoftrast.post_array4f[outarray], covering
// dpsoftrast.numvertices vertices starting at dpsoftrast.firstvertex.
//   outarray - index of the destination post-transform array
//   inarray  - DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_COLOR, or a
//              texcoord array (any other value is treated as
//              DPSOFTRAST_ARRAY_TEXCOORD0 + unit)
// Colors fall back to the constant dpsoftrast.color when neither a float nor a
// byte color pointer is bound. A texcoord array with no pointer bound, or with
// a component count other than 2, 3 or 4, leaves the destination untouched.
// Returns the destination array, or NULL when built without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const int first = dpsoftrast.firstvertex;
	const int count = dpsoftrast.numvertices;
	const unsigned char *inb;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, count, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + first * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, count, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, count, stride);
		}
		else
		{
			// no per-vertex colors bound: replicate the constant color
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, count);
		}
	}
	else
	{
		const int unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			const int components = dpsoftrast.components_texcoord[unit];
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
			if (components == 2)
				DPSOFTRAST_Load2fTo4f(outf, inb, count, stride);
			else if (components == 3)
				DPSOFTRAST_Load3fTo4f(outf, inb, count, stride);
			else if (components == 4)
				DPSOFTRAST_Load4fTo4f(outf, inb, count, stride);
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1988
1989 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1990 {
1991         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1992         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1993         return data;
1994 }
1995
1996 #if 0
// Compiled out by the surrounding #if 0.
// Loads inarray into post_array4f[outarray] (or reuses its current contents
// when inarray is negative), projects it in place with
// DPSOFTRAST_Vertex_Project, and stores the screen coordinates, row bounds and
// returned clip mask in dpsoftrast.screencoord4f, drawstarty/drawendy and
// drawclipped. Returns the array, or NULL without SSE_POSSIBLE.
1997 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1998 {
1999 #ifdef SSE_POSSIBLE
2000         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2001         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
2002         return data;
2003 #else
2004         return NULL;
2005 #endif
2006 }
2007 #endif
2008
// Transforms a vertex array in place by inmatrix16f and projects it to the
// screen.
//   outarray    - index of the post-transform array that holds the result
//   inarray     - source array to load first, or a negative value to use what
//                 is already in post_array4f[outarray]
//   inmatrix16f - the 16-float matrix passed to
//                 DPSOFTRAST_Vertex_TransformProject
// Screen coordinates go to dpsoftrast.screencoord4f, the row bounds to
// dpsoftrast.drawstarty/drawendy and the returned clip mask to
// dpsoftrast.drawclipped. Returns the transformed array, or NULL when built
// without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(
		data, dpsoftrast.screencoord4f,
		&dpsoftrast.drawstarty, &dpsoftrast.drawendy,
		data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2019
2020 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2021 {
2022         int x;
2023         int startx = span->startx;
2024         int endx = span->endx;
2025         float wslope = triangle->w[0];
2026         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2027         float endz = 1.0f / (w + wslope * startx);
2028         if (triangle->w[0] == 0)
2029         {
2030                 // LordHavoc: fast flat polygons (HUD/menu)
2031                 for (x = startx;x < endx;x++)
2032                         zf[x] = endz;
2033                 return;
2034         }
2035         for (x = startx;x < endx;)
2036         {
2037                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2038                 float z = endz, dz;
2039                 if (nextsub >= endx) nextsub = endsub = endx-1;
2040                 endz = 1.0f / (w + wslope * nextsub);
2041                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2042                 for (; x <= endsub; x++, z += dz)
2043                         zf[x] = z;
2044         }
2045 }
2046
2047 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2048 {
2049 #ifdef SSE_POSSIBLE
2050         int x;
2051         int startx = span->startx;
2052         int endx = span->endx;
2053         int maskx;
2054         int subx;
2055         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2056         unsigned char * RESTRICT pixelmask = span->pixelmask;
2057         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2058         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2059         if (!pixel)
2060                 return;
2061         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2062         pixeli += span->y * dpsoftrast.fb_width + span->x;
2063         // handle alphatest now (this affects depth writes too)
2064         if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2065                 for (x = startx;x < endx;x++)
2066                         if (in4ub[x*4+3] < 128)
2067                                 pixelmask[x] = false;
2068         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2069         // helps sprites, text and hud artwork
2070         switch(thread->fb_blendmode)
2071         {
2072         case DPSOFTRAST_BLENDMODE_ALPHA:
2073         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2074         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2075                 maskx = startx;
2076                 for (x = startx;x < endx;x++)
2077                 {
2078                         if (in4ub[x*4+3] >= 1)
2079                         {
2080                                 startx = x;
2081                                 for (;;)
2082                                 {
2083                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2084                                         maskx = x;
2085                                         if (x >= endx) break;
2086                                         ++x;
2087                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2088                                         if (x >= endx) break;
2089                                 }
2090                                 break;
2091                         }
2092                 }
2093                 endx = maskx;
2094                 break;
2095         case DPSOFTRAST_BLENDMODE_OPAQUE:
2096         case DPSOFTRAST_BLENDMODE_ADD:
2097         case DPSOFTRAST_BLENDMODE_INVMOD:
2098         case DPSOFTRAST_BLENDMODE_MUL:
2099         case DPSOFTRAST_BLENDMODE_MUL2:
2100         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2101         case DPSOFTRAST_BLENDMODE_INVADD:
2102                 break;
2103         }
2104         // put some special values at the end of the mask to ensure the loops end
2105         pixelmask[endx] = 1;
2106         pixelmask[endx+1] = 0;
2107         // LordHavoc: use a double loop to identify subspans, this helps the
2108         // optimized copy/blend loops to perform at their best, most triangles
2109         // have only one run of pixels, and do the search using wide reads...
2110         x = startx;
2111         while (x < endx)
2112         {
2113                 // if this pixel is masked off, it's probably not alone...
2114                 if (!pixelmask[x])
2115                 {
2116                         x++;
2117 #if 1
2118                         if (x + 8 < endx)
2119                         {
2120                                 // the 4-item search must be aligned or else it stalls badly
2121                                 if ((x & 3) && !pixelmask[x]) 
2122                                 {
2123                                         if(pixelmask[x]) goto endmasked;
2124                                         x++;
2125                                         if (x & 3)
2126                                         {
2127                                                 if(pixelmask[x]) goto endmasked;
2128                                                 x++;
2129                                                 if (x & 3)
2130                                                 {
2131                                                         if(pixelmask[x]) goto endmasked;
2132                                                         x++;
2133                                                 }
2134                                         }
2135                                 }
2136                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2137                                         x += 4;
2138                         }
2139 #endif
2140                         for (;!pixelmask[x];x++)
2141                                 ;
2142                         // rather than continue the loop, just check the end variable
2143                         if (x >= endx)
2144                                 break;
2145                 }
2146         endmasked:
2147                 // find length of subspan
2148                 subx = x + 1;
2149 #if 1
2150                 if (subx + 8 < endx)
2151                 {
2152                         if (subx & 3)
2153                         {
2154                                 if(!pixelmask[subx]) goto endunmasked;
2155                                 subx++;
2156                                 if (subx & 3)
2157                                 {
2158                                         if(!pixelmask[subx]) goto endunmasked;
2159                                         subx++;
2160                                         if (subx & 3)
2161                                         {
2162                                                 if(!pixelmask[subx]) goto endunmasked;
2163                                                 subx++;
2164                                         }
2165                                 }
2166                         }
2167                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2168                                 subx += 4;
2169                 }
2170 #endif
2171                 for (;pixelmask[subx];subx++)
2172                         ;
2173                 // the checks can overshoot, so make sure to clip it...
2174                 if (subx > endx)
2175                         subx = endx;
2176         endunmasked:
2177                 // now that we know the subspan length...  process!
2178                 switch(thread->fb_blendmode)
2179                 {
2180                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2181 #if 0
2182                         if (subx - x >= 16)
2183                         {
2184                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2185                                 x = subx;
2186                         }
2187                         else
2188 #elif 1
2189                         while (x + 16 <= subx)
2190                         {
2191                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2192                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2193                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2194                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2195                                 x += 16;
2196                         }
2197 #endif
2198                         {
2199                                 while (x + 4 <= subx)
2200                                 {
2201                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2202                                         x += 4;
2203                                 }
2204                                 if (x + 2 <= subx)
2205                                 {
2206                                         pixeli[x] = ini[x];
2207                                         pixeli[x+1] = ini[x+1];
2208                                         x += 2;
2209                                 }
2210                                 if (x < subx)
2211                                 {
2212                                         pixeli[x] = ini[x];
2213                                         x++;
2214                                 }
2215                         }
2216                         break;
2217                 case DPSOFTRAST_BLENDMODE_ALPHA:
2218                 #define FINISHBLEND(blend2, blend1) \
2219                         for (;x + 1 < subx;x += 2) \
2220                         { \
2221                                 __m128i src, dst; \
2222                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2223                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2224                                 blend2; \
2225                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2226                         } \
2227                         if (x < subx) \
2228                         { \
2229                                 __m128i src, dst; \
2230                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2231                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2232                                 blend1; \
2233                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2234                                 x++; \
2235                         }
2236                         FINISHBLEND({
2237                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2238                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2239                         }, {
2240                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2241                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2242                         });
2243                         break;
2244                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2245                         FINISHBLEND({
2246                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2247                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2248                         }, {
2249                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2250                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2251                         });
2252                         break;
2253                 case DPSOFTRAST_BLENDMODE_ADD:
2254                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2255                         break;
2256                 case DPSOFTRAST_BLENDMODE_INVMOD:
2257                         FINISHBLEND({
2258                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2259                         }, {
2260                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2261                         });
2262                         break;
2263                 case DPSOFTRAST_BLENDMODE_MUL:
2264                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2265                         break;
2266                 case DPSOFTRAST_BLENDMODE_MUL2:
2267                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2268                         break;
2269                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2270                         FINISHBLEND({
2271                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2272                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2273                         }, {
2274                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2275                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2276                         });
2277                         break;
2278                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2279                         FINISHBLEND({
2280                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2281                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2282                         }, {
2283                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2284                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2285                         });
2286                         break;
2287                 case DPSOFTRAST_BLENDMODE_INVADD:
2288                         FINISHBLEND({
2289                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2290                         }, {
2291                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2292                         });
2293                         break;
2294                 }
2295         }
2296 #endif
2297 }
2298
2299 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2300         // warning: this is SLOW, only use if the optimized per-span functions won't do
2301 {
2302         const unsigned char * RESTRICT pixelbase;
2303         const unsigned char * RESTRICT pixel[4];
2304         int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2305         int wrapmask[2] = { width-1, height-1 };
2306         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*width;
2307         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2308         {
2309                 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2310                 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2311                 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2312                 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2313                 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2314                 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2315                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2316                 {
2317                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2318                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2319                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2320                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2321                 }
2322                 else
2323                 {
2324                         tci[0] &= wrapmask[0];
2325                         tci[1] &= wrapmask[1];
2326                         tci1[0] &= wrapmask[0];
2327                         tci1[1] &= wrapmask[1];
2328                 }
2329                 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2330                 pixel[1] = pixelbase + 4 * (tci[0] - tci[1]*width);
2331                 pixel[2] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2332                 pixel[3] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2333                 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2334                 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2335                 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2336                 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2337         }
2338         else
2339         {
2340                 int tci[2] = { x * width, y * height };
2341                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2342                 {
2343                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2344                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2345                 }
2346                 else
2347                 {
2348                         tci[0] &= wrapmask[0];
2349                         tci[1] &= wrapmask[1];
2350                 }
2351                 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2352                 c[0] = pixel[0][0];
2353                 c[1] = pixel[0][1];
2354                 c[2] = pixel[0][2];
2355                 c[3] = pixel[0][3];
2356         }
2357 }
2358
// NOTE: everything up to the matching #endif is compiled out by this #if 0.
// It is the float-output counterpart of DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8.
2359 #if 0
// Samples the texture bound to unit `texunitindex` across one span and writes four
// floats per pixel to out4f[x*4 .. x*4+3] for x in [span->startx, span->endx).
// - Texel bytes 2,1,0,3 go to output components 0,1,2,3 (the first and third bytes
//   are swapped), scaled so that a byte value of 255 becomes 1.0.
// - Texture coordinates come from vertex attribute `arrayindex`
//   (DPSOFTRAST_CALCATTRIB4F), multiplied by zf[x] and by the mip level size.
//   They are evaluated exactly only at subspan boundaries (every
//   DPSOFTRAST_DRAW_MAXSUBSPAN pixels and at the last pixel) and stepped linearly in
//   between, in fixed point with 12 fractional bits.
// - No texture bound: the span is filled with 1.0.  One-pixel mip level: the span is
//   filled with that one colour.
2360 static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2361 {
2362         int x;
2363         int startx = span->startx;
2364         int endx = span->endx;
2365         int flags;
2366         float c[4];
2367         float data[4];
2368         float slope[4];
2369         float tc[2], endtc[2];
2370         float tcscale[2];
2371         unsigned int tci[2];
2372         unsigned int tci1[2];
2373         unsigned int tcimin[2];
2374         unsigned int tcimax[2];
2375         int tciwrapmask[2];
2376         int tciwidth;
2377         int filter;
2378         int mip;
2379         const unsigned char * RESTRICT pixelbase;
2380         const unsigned char * RESTRICT pixel[4];
2381         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2382         // if no texture is bound, just fill it with white
2383         if (!texture)
2384         {
2385                 for (x = startx;x < endx;x++)
2386                 {
2387                         out4f[x*4+0] = 1.0f;
2388                         out4f[x*4+1] = 1.0f;
2389                         out4f[x*4+2] = 1.0f;
2390                         out4f[x*4+3] = 1.0f;
2391                 }
2392                 return;
2393         }
2394         mip = triangle->mip[texunitindex];
        // pixelbase is the start of the last row of this mip level (offset + size - one row)
2395         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2396         // if this mipmap of the texture is 1 pixel, just fill it with that color
2397         if (texture->mipmap[mip][1] == 4)
2398         {
                // NOTE(review): this reads texture->bytes[0..3] directly rather than pixelbase,
                // i.e. the start of the texture data, not of the selected mip level -- confirm intended
2399                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2400                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2401                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2402                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2403                 for (x = startx;x < endx;x++)
2404                 {
2405                         out4f[x*4+0] = c[0];
2406                         out4f[x*4+1] = c[1];
2407                         out4f[x*4+2] = c[2];
2408                         out4f[x*4+3] = c[3];
2409                 }
2410                 return;
2411         }
2412         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2413         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2414         flags = texture->flags;
2415         tcscale[0] = texture->mipmap[mip][2];
2416         tcscale[1] = texture->mipmap[mip][3];
        // negative row stride in texels: rows are addressed backwards from pixelbase
2417         tciwidth = -texture->mipmap[mip][2];
2418         tcimin[0] = 0;
2419         tcimin[1] = 0;
2420         tcimax[0] = texture->mipmap[mip][2]-1;
2421         tcimax[1] = texture->mipmap[mip][3]-1;
2422         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2423         tciwrapmask[1] = texture->mipmap[mip][3]-1;
        // texel-space coordinate at the first pixel of the span
2424         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2425         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2426         if (filter)
2427         {
                // linear filtering samples around texel centres, hence the half-texel shift
2428                 endtc[0] -= 0.5f;
2429                 endtc[1] -= 0.5f;
2430         }
2431         for (x = startx;x < endx;)
2432         {
2433                 unsigned int subtc[2];
2434                 unsigned int substep[2];
                // converts a coordinate delta over the subspan into a per-pixel step with 12 fractional bits
2435                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2436                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2437                 if (nextsub >= endx)
2438                 {
                        // final subspan: end exactly on the last pixel of the span
2439                         nextsub = endsub = endx-1;      
2440                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2441                 }
2442                 tc[0] = endtc[0];
2443                 tc[1] = endtc[1];
2444                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2445                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2446                 if (filter)
2447                 {
2448                         endtc[0] -= 0.5f;
2449                         endtc[1] -= 0.5f;
2450                 }
                // NOTE(review): substep/subtc are unsigned but are assigned from floats that can be
                // negative (decreasing or negative coordinates) -- that conversion is undefined in C
2451                 substep[0] = (endtc[0] - tc[0]) * subscale;
2452                 substep[1] = (endtc[1] - tc[1]) * subscale;
2453                 subtc[0] = tc[0] * (1<<12);
2454                 subtc[1] = tc[1] * (1<<12);
2455                 if (filter)
2456                 {
2457                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2458                         {
                                // bilinear filter, clamp to edge
2459                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2460                                 {
                                        // low 12 bits = blend fraction; the four weights sum to 1<<24
2461                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2462                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2463                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2464                                         tci[0] = subtc[0]>>12;
2465                                         tci[1] = subtc[1]>>12;
2466                                         tci1[0] = tci[0] + 1;
2467                                         tci1[1] = tci[1] + 1;
                                        // NOTE(review): tci and tcimin (0) are unsigned, so the >= tcimin
                                        // tests are always true; only the upper bound actually clamps
2468                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2469                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2470                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2471                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2472                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2473                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2474                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2475                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
                                        // 0xFF000000 = 255 * (1<<24): undoes both the weight scale and the byte range
2476                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2477                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2478                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2479                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2480                                         out4f[x*4+0] = c[0];
2481                                         out4f[x*4+1] = c[1];
2482                                         out4f[x*4+2] = c[2];
2483                                         out4f[x*4+3] = c[3];
2484                                 }
2485                         }
2486                         else
2487                         {
                                // bilinear filter, wrapping via bit mask
2488                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2489                                 {
2490                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2491                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2492                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2493                                         tci[0] = subtc[0]>>12;
2494                                         tci[1] = subtc[1]>>12;
2495                                         tci1[0] = tci[0] + 1;
2496                                         tci1[1] = tci[1] + 1;
2497                                         tci[0] &= tciwrapmask[0];
2498                                         tci[1] &= tciwrapmask[1];
2499                                         tci1[0] &= tciwrapmask[0];
2500                                         tci1[1] &= tciwrapmask[1];
2501                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2502                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2503                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2504                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2505                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2506                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2507                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2508                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2509                                         out4f[x*4+0] = c[0];
2510                                         out4f[x*4+1] = c[1];
2511                                         out4f[x*4+2] = c[2];
2512                                         out4f[x*4+3] = c[3];
2513                                 }
2514                         }
2515                 }
2516                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2517                 {
                        // nearest texel, clamp to edge
2518                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2519                         {
2520                                 tci[0] = subtc[0]>>12;
2521                                 tci[1] = subtc[1]>>12;
2522                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2523                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2524                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2525                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2526                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2527                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2528                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2529                                 out4f[x*4+0] = c[0];
2530                                 out4f[x*4+1] = c[1];
2531                                 out4f[x*4+2] = c[2];
2532                                 out4f[x*4+3] = c[3];
2533                         }
2534                 }
2535                 else
2536                 {
                        // nearest texel, wrapping via bit mask
2537                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2538                         {
2539                                 tci[0] = subtc[0]>>12;
2540                                 tci[1] = subtc[1]>>12;
2541                                 tci[0] &= tciwrapmask[0];
2542                                 tci[1] &= tciwrapmask[1];
2543                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2544                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2545                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2546                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2547                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2548                                 out4f[x*4+0] = c[0];
2549                                 out4f[x*4+1] = c[1];
2550                                 out4f[x*4+2] = c[2];
2551                                 out4f[x*4+3] = c[3];
2552                         }
2553                 }
2554         }
2555 }
2556 #endif
2557
2558 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2559 {
2560 #ifdef SSE_POSSIBLE
2561         int x;
2562         int startx = span->startx;
2563         int endx = span->endx;
2564         int flags;
2565         __m128 data, slope, tcscale;
2566         __m128i tcsize, tcmask, tcoffset, tcmax;
2567         __m128 tc, endtc;
2568         __m128i subtc, substep, endsubtc;
2569         int filter;
2570         int mip;
2571         int affine; // LordHavoc: optimized affine texturing case
2572         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2573         const unsigned char * RESTRICT pixelbase;
2574         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2575         // if no texture is bound, just fill it with white
2576         if (!texture)
2577         {
2578                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2579                 return;
2580         }
2581         mip = triangle->mip[texunitindex];
2582         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2583         // if this mipmap of the texture is 1 pixel, just fill it with that color
2584         if (texture->mipmap[mip][1] == 4)
2585         {
2586                 unsigned int k = *((const unsigned int *)pixelbase);
2587                 for (x = startx;x < endx;x++)
2588                         outi[x] = k;
2589                 return;
2590         }
2591         affine = zf[startx] == zf[endx-1];
2592         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2593         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2594         flags = texture->flags;
2595         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2596         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2597         tcscale = _mm_cvtepi32_ps(tcsize);
2598         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2599         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2600         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2601         if (filter)
2602                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2603         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2604         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_sub_epi32(_mm_setzero_si128(), _mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0))), 18), _mm_set1_epi32(4));
2605         tcmax = _mm_packs_epi32(tcmask, tcmask);
2606         for (x = startx;x < endx;)
2607         {
2608                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2609                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2610                 if (nextsub >= endx || affine)
2611                 {
2612                         nextsub = endsub = endx-1;
2613                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2614                 }       
2615                 tc = endtc;
2616                 subtc = endsubtc;
2617                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2618                 if (filter)
2619                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2620                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2621                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2622                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2623                 substep = _mm_slli_epi32(substep, 1);
2624                 if (filter)
2625                 {
2626                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2627                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2628                         {
2629                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2630                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2631                                 {
2632                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2633                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2634                                         tci = _mm_madd_epi16(tci, tcoffset);
2635                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2636                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2637                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2638                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2639                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2640                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2641                                         fracm = _mm_srli_epi16(subtc, 1);
2642                                         pix1 = _mm_add_epi16(pix1,
2643                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2644                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2645                                         pix3 = _mm_add_epi16(pix3,
2646                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2647                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2648                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2649                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2650                                         pix2 = _mm_add_epi16(pix2,
2651                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2652                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2653                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2654                                 }
2655                                 if (x <= endsub)
2656                                 {
2657                                         const unsigned char * RESTRICT ptr1;
2658                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2659                                         tci = _mm_madd_epi16(tci, tcoffset);
2660                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2661                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2662                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2663                                         fracm = _mm_srli_epi16(subtc, 1);
2664                                         pix1 = _mm_add_epi16(pix1,
2665                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2666                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2667                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2668                                         pix1 = _mm_add_epi16(pix1,
2669                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2670                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2671                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2672                                         x++;
2673                                 }
2674                         }
2675                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2676                         {
2677                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2678                                 {
2679                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2680                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2681                                         tci = _mm_madd_epi16(tci, tcoffset);
2682                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2683                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2684                                                                                         _mm_setzero_si128());
2685                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2686                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2687                                                                                         _mm_setzero_si128());
2688                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2689                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2690                                         tci = _mm_madd_epi16(tci, tcoffset);
2691                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2692                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2693                                                                                         _mm_setzero_si128());
2694                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2695                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2696                                                                                         _mm_setzero_si128());
2697                                         fracm = _mm_srli_epi16(subtc, 1);
2698                                         pix1 = _mm_add_epi16(pix1,
2699                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2700                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2701                                         pix3 = _mm_add_epi16(pix3,
2702                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2703                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2704                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2705                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2706                                         pix2 = _mm_add_epi16(pix2,
2707                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2708                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2709                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2710                                 }
2711                                 if (x <= endsub)
2712                                 {
2713                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2714                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2715                                         tci = _mm_madd_epi16(tci, tcoffset);
2716                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2717                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2718                                                                                         _mm_setzero_si128());
2719                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2720                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2721                                                                                         _mm_setzero_si128());
2722                                         fracm = _mm_srli_epi16(subtc, 1);
2723                                         pix1 = _mm_add_epi16(pix1,
2724                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2725                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2726                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2727                                         pix1 = _mm_add_epi16(pix1,
2728                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2729                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2730                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2731                                         x++;
2732                                 }
2733                         }
2734                         else
2735                         {
2736                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2737                                 {
2738                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2739                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2740                                         tci = _mm_madd_epi16(tci, tcoffset);
2741                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2742                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2743                                                                                         _mm_setzero_si128());
2744                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2745                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2746                                                                                         _mm_setzero_si128());
2747                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2748                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2749                                         tci = _mm_madd_epi16(tci, tcoffset);
2750                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2751                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2752                                                                                         _mm_setzero_si128());
2753                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2754                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2755                                                                                         _mm_setzero_si128());
2756                                         fracm = _mm_srli_epi16(subtc, 1);
2757                                         pix1 = _mm_add_epi16(pix1,
2758                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2759                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2760                                         pix3 = _mm_add_epi16(pix3,
2761                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2762                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2763                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2764                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2765                                         pix2 = _mm_add_epi16(pix2,
2766                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2767                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2768                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2769                                 }
2770                                 if (x <= endsub)
2771                                 {
2772                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2773                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2774                                         tci = _mm_madd_epi16(tci, tcoffset);
2775                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2776                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2777                                                                                         _mm_setzero_si128());
2778                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2779                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2780                                                                                         _mm_setzero_si128());
2781                                         fracm = _mm_srli_epi16(subtc, 1);
2782                                         pix1 = _mm_add_epi16(pix1,
2783                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2784                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2785                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2786                                         pix1 = _mm_add_epi16(pix1,
2787                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2788                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2789                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2790                                         x++;
2791                                 }
2792                         }
2793                 }
2794                 else
2795                 {
2796                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2797                         {
2798                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2799                                 {
2800                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2801                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2802                                         tci = _mm_madd_epi16(tci, tcoffset);
2803                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2804                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2805                                 }
2806                                 if (x <= endsub)
2807                                 {
2808                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2809                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2810                                         tci = _mm_madd_epi16(tci, tcoffset);
2811                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2812                                         x++;
2813                                 }
2814                         }
2815                         else
2816                         {
2817                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2818                                 {
2819                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2820                                         tci = _mm_and_si128(tci, tcmax); 
2821                                         tci = _mm_madd_epi16(tci, tcoffset);
2822                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2823                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2824                                 }
2825                                 if (x <= endsub)
2826                                 {
2827                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2828                                         tci = _mm_and_si128(tci, tcmax); 
2829                                         tci = _mm_madd_epi16(tci, tcoffset);
2830                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2831                                         x++;
2832                                 }
2833                         }
2834                 }
2835         }
2836 #endif
2837 }
2838
2839 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2840 {
2841         // TODO: IMPLEMENT
2842         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2843 }
2844
// Shadowmap sampling placeholder: the lookup is not implemented, so the
// result is the constant 1.0f no matter what vector contains (vector is
// never read).
static float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2850
#if 0
// Disabled (compiled out) float path: for every pixel x of the span the
// attribute (data + slope*x) * zf[x] is evaluated per component and
// multiplied into the matching component of in4f, storing the product in
// out4f.  Both buffers hold 4 floats per pixel and are indexed by x.
static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
{
	int px;
	int i;
	const int firstx = span->startx;
	const int lastx = span->endx;
	float attrib[4];
	float data[4];
	float slope[4];
	float zscale;
	DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
	for (px = firstx;px < lastx;px++)
	{
		zscale = zf[px];
		// evaluate all four components before touching the output
		for (i = 0;i < 4;i++)
			attrib[i] = (data[i] + slope[i]*px) * zscale;
		for (i = 0;i < 4;i++)
			out4f[px*4+i] = in4f[px*4+i] * attrib[i];
	}
}
#endif
2876
#if 0
// Disabled (compiled out) float path: writes the attribute
// (data + slope*x) * zf[x] for every pixel x of the span into out4f
// (4 floats per pixel, indexed by x).
static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
{
	int px;
	int i;
	const int firstx = span->startx;
	const int lastx = span->endx;
	float attrib[4];
	float data[4];
	float slope[4];
	float zscale;
	DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
	for (px = firstx;px < lastx;px++)
	{
		zscale = zf[px];
		// evaluate all four components before touching the output
		for (i = 0;i < 4;i++)
			attrib[i] = (data[i] + slope[i]*px) * zscale;
		for (i = 0;i < 4;i++)
			out4f[px*4+i] = attrib[i];
	}
}
#endif
2902
#if 0
// Disabled (compiled out) float path: per component, subtracts subcolor from
// inb4f, raises negative differences to zero and adds the result to ina4f,
// storing the sum in out4f.  All buffers hold 4 floats per pixel, indexed by x.
static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
{
	int px;
	int i;
	const int firstx = span->startx;
	const int lastx = span->endx;
	float excess[4];
	float threshold[4];
	// take a private copy of the subtracted color before the loop
	for (i = 0;i < 4;i++)
		threshold[i] = subcolor[i];
	for (px = firstx;px < lastx;px++)
	{
		// all four differences are computed before the output is written
		for (i = 0;i < 4;i++)
		{
			excess[i] = inb4f[px*4+i] - threshold[i];
			if (excess[i] < 0.0f)
				excess[i] = 0.0f;
		}
		for (i = 0;i < 4;i++)
			out4f[px*4+i] = ina4f[px*4+i] + excess[i];
	}
}
#endif
2925
#if 0
// Disabled (compiled out) float path: out4f = ina4f * inb4f, component by
// component, for every pixel of the span (4 floats per pixel, indexed by x).
static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
{
	int px;
	int i;
	const int firstx = span->startx;
	const int lastx = span->endx;
	for (px = firstx;px < lastx;px++)
	{
		for (i = 0;i < 4;i++)
			out4f[px*4+i] = ina4f[px*4+i] * inb4f[px*4+i];
	}
}
#endif
2939
#if 0
// Disabled (compiled out) float path: out4f = ina4f + inb4f, component by
// component, for every pixel of the span (4 floats per pixel, indexed by x).
static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
{
	int px;
	int i;
	const int firstx = span->startx;
	const int lastx = span->endx;
	for (px = firstx;px < lastx;px++)
	{
		for (i = 0;i < 4;i++)
			out4f[px*4+i] = ina4f[px*4+i] + inb4f[px*4+i];
	}
}
#endif
2953
#if 0
// Disabled (compiled out) float path: blends ina4f and inb4f per pixel using
// the fourth component of inb4f as the weight:
//   out = ina * (1 - inb[3]) + inb * inb[3]
// All buffers hold 4 floats per pixel and are indexed by x.
static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
{
	int px;
	int i;
	const int firstx = span->startx;
	const int lastx = span->endx;
	float keep, blend;
	for (px = firstx;px < lastx;px++)
	{
		// both weights are read before any component is written
		keep = 1.0f - inb4f[px*4+3];
		blend = inb4f[px*4+3];
		for (i = 0;i < 4;i++)
			out4f[px*4+i] = ina4f[px*4+i] * keep + inb4f[px*4+i] * blend;
	}
}
#endif
2970
2971 #if 0
2972 static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2973 {
2974         int x, startx = span->startx, endx = span->endx;
2975         float localcolor[4], ilerp, lerp;
2976         localcolor[0] = color[0];
2977         localcolor[1] = color[1];
2978         localcolor[2] = color[2];
2979         localcolor[3] = color[3];
2980         ilerp = 1.0f - localcolor[3];
2981         lerp = localcolor[3];
2982         for (x = startx;x < endx;x++)
2983         {
2984                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2985                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2986                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2987                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2988         }
2989 }
2990 #endif
2991
2992
2993
2994 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2995 {
2996 #ifdef SSE_POSSIBLE
2997         int x;
2998         int startx = span->startx;
2999         int endx = span->endx;
3000         __m128 data, slope;
3001         __m128 mod, endmod;
3002         __m128i submod, substep, endsubmod;
3003         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3004         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3005         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3006         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3007         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3008         for (x = startx; x < endx;)
3009         {
3010                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3011                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3012                 if (nextsub >= endx)
3013                 {
3014                         nextsub = endsub = endx-1;
3015                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3016                 }
3017                 mod = endmod;
3018                 submod = endsubmod;
3019                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3020                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3021                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3022                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3023                 substep = _mm_packs_epi32(substep, substep);
3024                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3025                 {
3026                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3027                         pix = _mm_mulhi_epu16(pix, submod);
3028                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3029                 }
3030                 if (x <= endsub)
3031                 {
3032                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3033                         pix = _mm_mulhi_epu16(pix, submod);
3034                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3035                         x++;
3036                 }
3037         }
3038 #endif
3039 }
3040
3041 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3042 {
3043 #ifdef SSE_POSSIBLE
3044         int x;
3045         int startx = span->startx;
3046         int endx = span->endx;
3047         __m128 data, slope;
3048         __m128 mod, endmod;
3049         __m128i submod, substep, endsubmod;
3050         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3051         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3052         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3053         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3054         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3055         for (x = startx; x < endx;)
3056         {
3057                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3058                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3059                 if (nextsub >= endx)
3060                 {
3061                         nextsub = endsub = endx-1;
3062                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3063                 }
3064                 mod = endmod;
3065                 submod = endsubmod;
3066                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3067                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3068                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3069                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3070                 substep = _mm_packs_epi32(substep, substep);
3071                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3072                 {
3073                         __m128i pix = _mm_srai_epi16(submod, 4);
3074                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3075                 }
3076                 if (x <= endsub)
3077                 {
3078                         __m128i pix = _mm_srai_epi16(submod, 4);
3079                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3080                         x++;
3081                 }
3082         }
3083 #endif
3084 }
3085
3086 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3087 {
3088 #ifdef SSE_POSSIBLE
3089         int x, startx = span->startx, endx = span->endx;
3090         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3091         localcolor = _mm_packs_epi32(localcolor, localcolor);
3092         for (x = startx;x+2 <= endx;x+=2)
3093         {
3094                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3095                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3096                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3097                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3098         }
3099         if (x < endx)
3100         {
3101                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3102                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3103                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3104                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3105         }
3106 #endif
3107 }
3108
3109 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3110 {
3111 #ifdef SSE_POSSIBLE
3112         int x, startx = span->startx, endx = span->endx;
3113         for (x = startx;x+2 <= endx;x+=2)
3114         {
3115                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3116                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3117                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3118                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3119         }
3120         if (x < endx)
3121         {
3122                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3123                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3124                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3125                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3126         }
3127 #endif
3128 }
3129
3130 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3131 {
3132 #ifdef SSE_POSSIBLE
3133         int x, startx = span->startx, endx = span->endx;
3134         for (x = startx;x+2 <= endx;x+=2)
3135         {
3136                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3137                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3138                 pix1 = _mm_add_epi16(pix1, pix2);
3139                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3140         }
3141         if (x < endx)
3142         {
3143                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3144                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3145                 pix1 = _mm_add_epi16(pix1, pix2);
3146                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3147         }
3148 #endif
3149 }
3150
3151 #if 0
3152 static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3153 {
3154 #ifdef SSE_POSSIBLE
3155         int x, startx = span->startx, endx = span->endx;
3156         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3157         tint = _mm_packs_epi32(tint, tint);
3158         for (x = startx;x+2 <= endx;x+=2)
3159         {
3160                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3161                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3162                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3163                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3164         }
3165         if (x < endx)
3166         {
3167                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3168                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3169                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3170                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3171         }
3172 #endif
3173 }
3174 #endif
3175
3176 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3177 {
3178 #ifdef SSE_POSSIBLE
3179         int x, startx = span->startx, endx = span->endx;
3180         for (x = startx;x+2 <= endx;x+=2)
3181         {
3182                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3183                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3184                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3185                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3186                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3187         }
3188         if (x < endx)
3189         {
3190                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3191                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3192                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3193                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3194                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3195         }
3196 #endif
3197 }
3198
3199 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3200 {
3201 #ifdef SSE_POSSIBLE
3202         int x, startx = span->startx, endx = span->endx;
3203         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3204         localcolor = _mm_packs_epi32(localcolor, localcolor);
3205         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3206         for (x = startx;x+2 <= endx;x+=2)
3207         {
3208                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3209                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3210                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3211         }
3212         if (x < endx)
3213         {
3214                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3215                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3216                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3217         }
3218 #endif
3219 }
3220
3221
3222
3223 static void DPSOFTRAST_VertexShader_Generic(void)
3224 {
3225         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3226         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3227         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3228         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3229                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3230 }
3231
3232 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3233 {
3234         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3235         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3236         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3237         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3238         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3239         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3240         {
3241                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3242                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3243                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3244                 {
3245                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3246                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3247                         {
3248                                 // multiply
3249                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3250                         }
3251                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3252                         {
3253                                 // add
3254                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3255                         }
3256                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3257                         {
3258                                 // alphablend
3259                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3260                         }
3261                 }
3262         }
3263         else
3264                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3265         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3266 }
3267
3268
3269
3270 static void DPSOFTRAST_VertexShader_PostProcess(void)
3271 {
3272         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3273         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3274         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3275 }
3276
3277 static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3278 {
3279         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3280         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3281         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3282         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3283         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3284         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3285         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3286         {
3287                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3288                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3289         }
3290         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3291         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3292         {
3293                 // TODO: implement saturation
3294         }
3295         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3296         {
3297                 // TODO: implement gammaramps
3298         }
3299         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3300 }
3301
3302
3303
3304 static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3305 {
3306         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3307 }
3308
3309 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3310 {
3311         // this is never called (because colormask is off when this shader is used)
3312         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3313         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3314         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3315         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3316         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3317 }
3318
3319
3320
3321 static void DPSOFTRAST_VertexShader_FlatColor(void)
3322 {
3323         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3324         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3325 }
3326
3327 static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3328 {
3329 #ifdef SSE_POSSIBLE
3330         unsigned char * RESTRICT pixelmask = span->pixelmask;
3331         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3332         int x, startx = span->startx, endx = span->endx;
3333         __m128i Color_Ambientm;
3334         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3335         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3336         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3337         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3338         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3339         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3340                 pixel = buffer_FragColorbgra8;
3341         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3342         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3343         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3344         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3345         for (x = startx;x < endx;x++)
3346         {
3347                 __m128i color, pix;
3348                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3349                 {
3350                         __m128i pix2;
3351                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3352                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3353                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3354                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3355                         x += 3;
3356                         continue;
3357                 }
3358                 if (!pixelmask[x])
3359                         continue;
3360                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3361                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3362                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3363         }
3364         if (pixel == buffer_FragColorbgra8)
3365                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3366 #endif
3367 }
3368
3369
3370
3371 static void DPSOFTRAST_VertexShader_VertexColor(void)
3372 {
3373         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3374         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3375         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3376 }
3377
3378 static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3379 {
3380 #ifdef SSE_POSSIBLE
3381         unsigned char * RESTRICT pixelmask = span->pixelmask;
3382         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3383         int x, startx = span->startx, endx = span->endx;
3384         __m128i Color_Ambientm, Color_Diffusem;
3385         __m128 data, slope;
3386         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3387         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3388         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3389         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3390         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3391         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3392         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3393                 pixel = buffer_FragColorbgra8;
3394         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3395         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3396         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3397         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3398         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3399         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3400         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3401         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3402         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3403         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3404         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3405         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3406         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3407         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3408         {
3409                 __m128i color, mod, pix;
3410                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3411                 {
3412                         __m128i pix2, mod2;
3413                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3414                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3415                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3416                         data = _mm_add_ps(data, slope);
3417                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3418                         data = _mm_add_ps(data, slope);
3419                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3420                         data = _mm_add_ps(data, slope);
3421                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3422                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3423                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3424                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3425                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3426                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3427                         x += 3;
3428                         continue;
3429                 }
3430                 if (!pixelmask[x])
3431                         continue;
3432                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3433                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3434                 mod = _mm_packs_epi32(mod, mod);
3435                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3436                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3437         }
3438         if (pixel == buffer_FragColorbgra8)
3439                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3440 #endif
3441 }
3442
3443
3444
3445 static void DPSOFTRAST_VertexShader_Lightmap(void)
3446 {
3447         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3448         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3449         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3450 }
3451
/*
 * Pixel shader for the lightmap shader mode.
 *
 * For every pixel of the span whose pixelmask byte is set this computes, per
 * channel in 16-bit fixed point,
 *     (lightmap * Color_Diffuse + Color_Ambient) * color  [+ glow * Color_Glow]
 * where the glow term is only present with SHADERPERMUTATION_GLOW.
 *
 * thread   - supplies the uniforms, shader permutation and blend mode
 * triangle - only forwarded to the span helpers
 * span     - supplies pixelmask, startx/endx and the framebuffer position (x, y)
 *
 * The whole body is compiled only when SSE_POSSIBLE is defined; without it
 * the function does nothing.
 */
3452 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3453 {
3454 #ifdef SSE_POSSIBLE
3455         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // default destination: color buffer 0 at the span's row/column, 4 bytes per pixel
3456         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3457         int x, startx = span->startx, endx = span->endx;
3458         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3459         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3460         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3461         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3462         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3463         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
             // NOTE(review): Span_Begin presumably fills buffer_z, which is only forwarded to the texture fetches here - confirm
3464         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // color texels are fetched with TEXCOORD0, lightmap texels with TEXCOORD4
3465         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3466         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
             // with alpha kill or any non-opaque blend mode, render into the temporary buffer
             // and let DPSOFTRAST_Draw_Span_FinishBGRA8 (end of function) deal with the framebuffer
3467         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3468                 pixel = buffer_FragColorbgra8;
             // color constants: uniform * 256 converted to 32-bit integers, lanes 0 and 2 swapped by the
             // shuffle (presumably RGB uniform order -> BGR texel order), lane 3 cleared, then packed to
             // 16 bits with signed saturation so both 64-bit halves hold the same four values.
             // only the ambient constant gets a non-zero lane 3: the Alpha uniform scaled by 255
3469         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3470         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3471         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3472         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3473         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3474         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3475         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3476         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3477         {
                     // glow texels share TEXCOORD0 with the color texture
3478                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3479                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3480                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3481                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
                     // low 64 bits = ambient, high 64 bits = glow color; used by the single-pixel path below
3482                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3483                 for (x = startx;x < endx;x++)
3484                 {
3485                         __m128i color, lightmap, glow, pix;
                             // fast path: at least 4 pixels left and all 4 mask bytes equal to 1
3486                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3487                         {
3488                                 __m128i pix2;
3489                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3490                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3491                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
                                     // unpacking with zero as the low byte puts each texel byte into the high byte of a
                                     // 16-bit lane, so _mm_mulhi_epu16 (upper 16 bits of the product) acts as a
                                     // fixed-point multiply; pix covers pixels x..x+1, pix2 covers pixels x+2..x+3
3492                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3493                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3494                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3495                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3496                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3497                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
                                     // pack back to bytes with unsigned saturation and store all 4 pixels
3498                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                     // together with the loop's x++ this advances by 4 pixels
3499                                 x += 3;
3500                                 continue;
3501                         }
                             // single-pixel path: masked-out pixels are left untouched
3502                         if (!pixelmask[x])
3503                                 continue;
3504                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3505                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3506                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
                             // the upper half of lightmap is zero here, so the low half evaluates
                             // (lightmap*diffuse + ambient) * color while the high half evaluates glowcolor * glow
3507                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
                             // add the high half (glow term) onto the low half
3508                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3509                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3510                 }
3511         }
3512         else
3513         {
                     // same loop structure as above, without the glow term
3514                 for (x = startx;x < endx;x++)
3515                 {
3516                         __m128i color, lightmap, pix;
3517                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3518                         {
3519                                 __m128i pix2;
3520                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3521                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3522                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3523                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3524                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3525                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3526                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3527                                 x += 3;
3528                                 continue;
3529                         }
3530                         if (!pixelmask[x]) 
3531                                 continue;
3532                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3533                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3534                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3535                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3536                 }
3537         }
             // if the temporary buffer was the destination, hand it to the span finisher
3538         if (pixel == buffer_FragColorbgra8)
3539                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3540 #endif
3541 }
3542
3543
// Forward declarations: both shaders are defined further down in this file, but the
// thin wrapper shaders that follow call them before their definitions.
3544 void DPSOFTRAST_VertexShader_LightDirection(void);
3545 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3546
// Vertex shader for the fake-light mode: forwards to the shared
// DPSOFTRAST_VertexShader_LightDirection without any additional work.
3547 static void DPSOFTRAST_VertexShader_FakeLight(void)
3548 {
3549         DPSOFTRAST_VertexShader_LightDirection();
3550 }
3551
// Pixel shader for the fake-light mode: forwards all three arguments unchanged to
// DPSOFTRAST_PixelShader_LightDirection, which has its own SHADERMODE_FAKELIGHT
// branches selected via thread->shader_mode.
3552 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3553 {
3554         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3555 }
3556
3557
3558
// Vertex shader for the model-space light direction map mode: runs the shared
// light-direction vertex shader and additionally loads TEXCOORD4.
3559 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3560 {
3561         DPSOFTRAST_VertexShader_LightDirection();
             // TEXCOORD4 is the coordinate set DPSOFTRAST_PixelShader_LightDirection uses for
             // GL20TU_LIGHTMAP and GL20TU_DELUXEMAP in this mode
3562         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3563 }
3564
// Pixel shader for the model-space light direction map mode: forwards all three
// arguments unchanged to DPSOFTRAST_PixelShader_LightDirection, which has its own
// SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE branches selected via thread->shader_mode.
3565 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3566 {
3567         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3568 }
3569
3570
3571
// Vertex shader for the tangent-space light direction map mode: runs the shared
// light-direction vertex shader and additionally loads TEXCOORD4.
3572 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3573 {
3574         DPSOFTRAST_VertexShader_LightDirection();
             // TEXCOORD4 is the coordinate set DPSOFTRAST_PixelShader_LightDirection uses for
             // GL20TU_LIGHTMAP and GL20TU_DELUXEMAP in this mode
3575         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3576 }
3577
// Pixel shader for the tangent-space light direction map mode: forwards all three
// arguments unchanged to DPSOFTRAST_PixelShader_LightDirection, which has its own
// SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE branches selected via thread->shader_mode.
3578 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3579 {
3580         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3581 }
3582
3583
3584
3585 void DPSOFTRAST_VertexShader_LightDirection(void)
3586 {
3587         int i;
3588         int numvertices = dpsoftrast.numvertices;
3589         float LightDir[4];
3590         float LightVector[4];
3591         float EyePosition[4];
3592         float EyeVectorModelSpace[4];
3593         float EyeVector[4];
3594         float position[4];
3595         float svector[4];
3596         float tvector[4];
3597         float normal[4];
3598         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3599         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3600         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3601         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3602         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3603         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3604         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3605         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3606         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3607         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3608         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3609         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3610         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3611         for (i = 0;i < numvertices;i++)
3612         {
3613                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3614                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3615                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3616                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3617                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3618                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3619                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3620                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3621                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3622                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3623                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3624                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3625                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3626                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3627                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3628                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3629                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3630                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3631                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3632                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3633                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3634                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3635                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3636                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3637                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3638                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3639                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3640                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3641                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3642         }
3643         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3644 }
3645
3646 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3647 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3648 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3649 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3650 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3651 #define DPSOFTRAST_Vector3Normalize(v)\
3652 do\
3653 {\
3654         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3655         if (len)\
3656         {\
3657                 len = 1.0f / len;\
3658                 v[0] *= len;\
3659                 v[1] *= len;\
3660                 v[2] *= len;\
3661         }\
3662 }\
3663 while(0)
3664
3665 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3666 {
3667         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3668         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3669         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3670         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3671         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3672         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3673         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3674         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3675         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3676         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3677         int x, startx = span->startx, endx = span->endx;
3678         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3679         float LightVectordata[4];
3680         float LightVectorslope[4];
3681         float EyeVectordata[4];
3682         float EyeVectorslope[4];
3683         float VectorSdata[4];
3684         float VectorSslope[4];
3685         float VectorTdata[4];
3686         float VectorTslope[4];
3687         float VectorRdata[4];
3688         float VectorRslope[4];
3689         float z;
3690         float diffusetex[4];
3691         float glosstex[4];
3692         float surfacenormal[4];
3693         float lightnormal[4];
3694         float lightnormal_modelspace[4];
3695         float eyenormal[4];
3696         float specularnormal[4];
3697         float diffuse;
3698         float specular;
3699         float SpecularPower;
3700         int d[4];
3701         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3702         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3703         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3704         Color_Glow[3] = 0.0f;
3705         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3706         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3707         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3708         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3709         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3710         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3711         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3712         Color_Pants[3] = 0.0f;
3713         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3714         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3715         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3716         Color_Shirt[3] = 0.0f;
3717         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3718         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3719         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3720         {
3721                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3722                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3723         }
3724         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3725         {
3726                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3727         }
3728         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3729         {
3730                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3731                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3732                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3733                 Color_Diffuse[3] = 0.0f;
3734                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3735                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3736                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3737                 LightColor[3] = 0.0f;
3738                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3739                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3740                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3741                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3742                 Color_Specular[3] = 0.0f;
3743                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3744                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3745                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3746
3747                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3748                 {
3749                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3750                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3751                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3752                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3753                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3754                 }
3755                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3756                 {
3757                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3758                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3759                 }
3760                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3761                 {
3762                         // nothing of this needed
3763                 }
3764                 else
3765                 {
3766                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3767                 }
3768
3769                 for (x = startx;x < endx;x++)
3770                 {
3771                         z = buffer_z[x];
3772                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3773                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3774                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3775                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3776                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3777                         {
3778                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3779                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3780                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3781                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3782                         }
3783                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3784                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3785                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3786                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3787                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3788                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3789                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3790                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3791
3792                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3793                         {
3794                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3795                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3796                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3797                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3798
3799                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3800                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3801                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3802                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3803
3804                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3805                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3806                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3807                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3808
3809                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3810                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3811                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3812                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3813
3814                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3815                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3816
3817                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3818                                 {
3819                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3820                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3821                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3822                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3823                                 }
3824                         }
3825                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3826                         {
3827                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3828                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3829                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3830                                 {
3831                                         float f = 1.0f / 256.0f;
3832                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3833                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3834                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3835                                 }
3836                         }
3837                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3838                         {
3839                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3840                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3841                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3842                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3843
3844                                 LightColor[0] = 1.0;
3845                                 LightColor[1] = 1.0;
3846                                 LightColor[2] = 1.0;
3847                         }
3848                         else
3849                         {
3850                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3851                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3852                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3853                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3854                         }
3855
3856                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3857
3858                         if(thread->shader_exactspecularmath)
3859                         {
3860                                 // reflect lightnormal at surfacenormal, take the negative of that
3861                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3862                                 float f;
3863                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3864                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3865                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3866                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3867
3868                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3869                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3870                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3871                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3872                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3873
3874                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3875                         }
3876                         else
3877                         {
3878                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3879                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3880                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3881                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3882
3883                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3884                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3885                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3886                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3887
3888                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3889                         }
3890                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
3891
3892                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3893                         {
3894                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3895                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3896                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3897                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3898                         }
3899                         else
3900                         {
3901                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3902                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3903                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3904                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3905                         }
3906
3907                         buffer_FragColorbgra8[x*4+0] = d[0];
3908                         buffer_FragColorbgra8[x*4+1] = d[1];
3909                         buffer_FragColorbgra8[x*4+2] = d[2];
3910                         buffer_FragColorbgra8[x*4+3] = d[3];
3911                 }
3912         }
3913         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3914         {
3915                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3916                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3917                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3918                 Color_Diffuse[3] = 0.0f;
3919                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3920                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3921                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3922                 LightColor[3] = 0.0f;
3923                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3924
3925                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3926                 {
3927                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3928                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3929                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3930                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3931                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3932                 }
3933                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3934                 {
3935                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3936                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3937                 }
3938                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3939                 {
3940                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3941                 }
3942                 else
3943                 {
3944                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3945                 }
3946
3947                 for (x = startx;x < endx;x++)
3948                 {
3949                         z = buffer_z[x];
3950                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3951                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3952                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3953                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3954                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3955                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3956                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3957                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3958
3959                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3960                         {
3961                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3962                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3963                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3964                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3965
3966                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3967                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3968                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3969                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3970
3971                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3972                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3973                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3974                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3975
3976                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3977                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3978                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3979                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3980
3981                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3982                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3983
3984                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3985                                 {
3986                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3987                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3988                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3989                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3990                                 }
3991                         }
3992                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3993                         {
3994                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3995                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3996                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3997                                 {
3998                                         float f = 1.0f / 256.0f;
3999                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4000                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4001                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4002                                 }
4003                         }
4004                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4005                         {
4006                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4007                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4008                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4009                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4010
4011                                 LightColor[0] = 1.0;
4012                                 LightColor[1] = 1.0;
4013                                 LightColor[2] = 1.0;
4014                         }
4015                         else
4016                         {
4017                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4018                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4019                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4020                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4021                         }
4022
4023                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4024                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4025                         {
4026                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4027                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4028                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4029                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4030                         }
4031                         else
4032                         {
4033                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4034                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4035                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4036                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4037                         }
4038                         buffer_FragColorbgra8[x*4+0] = d[0];
4039                         buffer_FragColorbgra8[x*4+1] = d[1];
4040                         buffer_FragColorbgra8[x*4+2] = d[2];
4041                         buffer_FragColorbgra8[x*4+3] = d[3];
4042                 }
4043         }
4044         else
4045         {
4046                 for (x = startx;x < endx;x++)
4047                 {
4048                         z = buffer_z[x];
4049                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4050                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4051                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4052                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4053
4054                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4055                         {
4056                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4057                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4058                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4059                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4060                         }
4061                         else
4062                         {
4063                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4064                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4065                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4066                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4067                         }
4068                         buffer_FragColorbgra8[x*4+0] = d[0];
4069                         buffer_FragColorbgra8[x*4+1] = d[1];
4070                         buffer_FragColorbgra8[x*4+2] = d[2];
4071                         buffer_FragColorbgra8[x*4+3] = d[3];
4072                 }
4073         }
4074         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4075 }
4076
4077
4078
4079 static void DPSOFTRAST_VertexShader_LightSource(void)
4080 {
4081         int i;
4082         int numvertices = dpsoftrast.numvertices;
4083         float LightPosition[4];
4084         float LightVector[4];
4085         float LightVectorModelSpace[4];
4086         float EyePosition[4];
4087         float EyeVectorModelSpace[4];
4088         float EyeVector[4];
4089         float position[4];
4090         float svector[4];
4091         float tvector[4];
4092         float normal[4];
4093         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4094         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4095         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4096         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4097         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4098         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4099         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4100         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4101         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4102         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4103         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4104         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4105         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4106         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4107         for (i = 0;i < numvertices;i++)
4108         {
4109                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4110                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4111                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4112                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4113                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4114                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4115                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4116                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4117                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4118                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4119                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4120                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4121                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4122                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4123                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4124                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4125                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4126                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4127                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4128                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4129                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4130                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4131                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4132                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4133                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4134                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4135                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4136                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4137                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4138                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4139                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4140                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4141         }
4142         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4143         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4144 }
4145
4146 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4147 {
4148 #ifdef SSE_POSSIBLE
4149         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4150         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4151         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4152         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4153         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4154         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4155         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4156         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4157         int x, startx = span->startx, endx = span->endx;
4158         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4159         float CubeVectordata[4];
4160         float CubeVectorslope[4];
4161         float LightVectordata[4];
4162         float LightVectorslope[4];
4163         float EyeVectordata[4];
4164         float EyeVectorslope[4];
4165         float z;
4166         float diffusetex[4];
4167         float glosstex[4];
4168         float surfacenormal[4];
4169         float lightnormal[4];
4170         float eyenormal[4];
4171         float specularnormal[4];
4172         float diffuse;
4173         float specular;
4174         float SpecularPower;
4175         float CubeVector[4];
4176         float attenuation;
4177         int d[4];
4178         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4179         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4180         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4181         Color_Glow[3] = 0.0f;
4182         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4183         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4184         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4185         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4186         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4187         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4188         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4189         Color_Diffuse[3] = 0.0f;
4190         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4191         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4192         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4193         Color_Specular[3] = 0.0f;
4194         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4195         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4196         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4197         Color_Pants[3] = 0.0f;
4198         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4199         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4200         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4201         Color_Shirt[3] = 0.0f;
4202         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4203         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4204         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4205         LightColor[3] = 0.0f;
4206         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4207         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4208         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4209         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4210         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4211         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4212         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4213         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4214         {
4215                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4216                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4217         }
4218         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4219                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4220         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4221         {
4222                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4223                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4224                 for (x = startx;x < endx;x++)
4225                 {
4226                         z = buffer_z[x];
4227                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4228                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4229                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4230                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4231                         if (attenuation < 0.01f)
4232                                 continue;
4233                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4234                         {
4235                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4236                                 if (attenuation < 0.01f)
4237                                         continue;
4238                         }
4239
4240                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4241                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4242                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4243                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4244                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4245                         {
4246                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4247                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4248                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4249                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4250                         }
4251                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4252                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4253                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4254                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4255                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4256                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4257                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4258                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4259
4260                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4261                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4262                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4263                         DPSOFTRAST_Vector3Normalize(lightnormal);
4264
4265                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4266
4267                         if(thread->shader_exactspecularmath)
4268                         {
4269                                 // reflect lightnormal at surfacenormal, take the negative of that
4270                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4271                                 float f;
4272                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4273                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4274                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4275                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4276
4277                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4278                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4279                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4280                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4281                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4282
4283                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4284                         }
4285                         else
4286                         {
4287                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4288                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4289                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4290                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4291
4292                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4293                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4294                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4295                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4296
4297                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4298                         }
4299                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
4300
4301                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4302                         {
4303                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4304                                 attenuation *= (1.0f / 255.0f);
4305                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4306                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4307                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4308                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4309                         }
4310                         else
4311                         {
4312                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4313                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4314                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4315                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4316                         }
4317                         buffer_FragColorbgra8[x*4+0] = d[0];
4318                         buffer_FragColorbgra8[x*4+1] = d[1];
4319                         buffer_FragColorbgra8[x*4+2] = d[2];
4320                         buffer_FragColorbgra8[x*4+3] = d[3];
4321                 }
4322         }
4323         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4324         {
4325                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4326                 for (x = startx;x < endx;x++)
4327                 {
4328                         z = buffer_z[x];
4329                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4330                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4331                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4332                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4333                         if (attenuation < 0.01f)
4334                                 continue;
4335                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4336                         {
4337                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4338                                 if (attenuation < 0.01f)
4339                                         continue;
4340                         }
4341
4342                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4343                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4344                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4345                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4346                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4347                         {
4348                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4349                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4350                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4351                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4352                         }
4353                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4354                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4355                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4356                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4357
4358                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4359                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4360                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4361                         DPSOFTRAST_Vector3Normalize(lightnormal);
4362
4363                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4364                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4365                         {
4366                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4367                                 attenuation *= (1.0f / 255.0f);
4368                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4369                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4370                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4371                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4372                         }
4373                         else
4374                         {
4375                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4376                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4377                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4378                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4379                         }
4380                         buffer_FragColorbgra8[x*4+0] = d[0];
4381                         buffer_FragColorbgra8[x*4+1] = d[1];
4382                         buffer_FragColorbgra8[x*4+2] = d[2];
4383                         buffer_FragColorbgra8[x*4+3] = d[3];
4384                 }
4385         }
4386         else
4387         {
4388                 for (x = startx;x < endx;x++)
4389                 {
4390                         z = buffer_z[x];
4391                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4392                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4393                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4394                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4395                         if (attenuation < 0.01f)
4396                                 continue;
4397                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4398                         {
4399                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4400                                 if (attenuation < 0.01f)
4401                                         continue;
4402                         }
4403
4404                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4405                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4406                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4407                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4408                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4409                         {
4410                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4411                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4412                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4413                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4414                         }
4415                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4416                         {
4417                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4418                                 attenuation *= (1.0f / 255.0f);
4419                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4420                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4421                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4422                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4423                         }
4424                         else
4425                         {
4426                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4427                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4428                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4429                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4430                         }
4431                         buffer_FragColorbgra8[x*4+0] = d[0];
4432                         buffer_FragColorbgra8[x*4+1] = d[1];
4433                         buffer_FragColorbgra8[x*4+2] = d[2];
4434                         buffer_FragColorbgra8[x*4+3] = d[3];
4435                 }
4436         }
4437         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4438 #endif
4439 }
4440
4441
4442
4443 static void DPSOFTRAST_VertexShader_Refraction(void)
4444 {
4445         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4446         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4447         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4448 }
4449
4450 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4451 {
4452         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4453         float z;
4454         int x, startx = span->startx, endx = span->endx;
4455
4456         // texture reads
4457         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4458         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4459
4460         // varyings
4461         float ModelViewProjectionPositiondata[4];
4462         float ModelViewProjectionPositionslope[4];
4463
4464         // uniforms
4465         float ScreenScaleRefractReflect[2];
4466         float ScreenCenterRefractReflect[2];
4467         float DistortScaleRefractReflect[2];
4468         float RefractColor[4];
4469
4470         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4471         if(!texture) return;
4472
4473         // read textures
4474         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4475         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4476
4477         // read varyings
4478         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4479
4480         // read uniforms
4481         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4482         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4483         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4484         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4485         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4486         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4487         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4488         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4489         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4490         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4491
4492         // do stuff
4493         for (x = startx;x < endx;x++)
4494         {
4495                 float SafeScreenTexCoord[2];
4496                 float ScreenTexCoord[2];
4497                 float v[3];
4498                 float iw;
4499                 unsigned char c[4];
4500
4501                 z = buffer_z[x];
4502
4503                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4504                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4505
4506                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4507                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4508                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4509
4510                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4511                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4512                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4513                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4514                 DPSOFTRAST_Vector3Normalize(v);
4515                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4516                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4517
4518                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4519                 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4520
4521                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4522                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4523                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4524                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4525         }
4526
4527         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4528 }
4529
4530
4531
4532 static void DPSOFTRAST_VertexShader_Water(void)
4533 {
4534         int i;
4535         int numvertices = dpsoftrast.numvertices;
4536         float EyePosition[4];
4537         float EyeVectorModelSpace[4];
4538         float EyeVector[4];
4539         float position[4];
4540         float svector[4];
4541         float tvector[4];
4542         float normal[4];
4543         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4544         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4545         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4546         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4547         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4548         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4549         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4550         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4551         for (i = 0;i < numvertices;i++)
4552         {
4553                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4554                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4555                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4556                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4557                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4558                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4559                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4560                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4561                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4562                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4563                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4564                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4565                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4566                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4567                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4568                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4569                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4570                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4571                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4572                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4573                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4574                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4575         }
4576         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4577         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4578         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4579 }
4580
4581
4582 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4583 {
4584         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4585         float z;
4586         int x, startx = span->startx, endx = span->endx;
4587
4588         // texture reads
4589         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4590         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4591
4592         // varyings
4593         float ModelViewProjectionPositiondata[4];
4594         float ModelViewProjectionPositionslope[4];
4595         float EyeVectordata[4];
4596         float EyeVectorslope[4];
4597
4598         // uniforms
4599         float ScreenScaleRefractReflect[4];
4600         float ScreenCenterRefractReflect[4];
4601         float DistortScaleRefractReflect[4];
4602         float RefractColor[4];
4603         float ReflectColor[4];
4604         float ReflectFactor;
4605         float ReflectOffset;
4606
4607         DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4608         DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4609         if(!texture_refraction || !texture_reflection) return;
4610
4611         // read textures
4612         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4613         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4614
4615         // read varyings
4616         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4617         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4618
4619         // read uniforms
4620         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4621         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4622         ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4623         ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4624         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4625         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4626         ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4627         ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4628         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4629         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4630         DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4631         DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4632         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4633         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4634         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4635         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4636         ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4637         ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4638         ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4639         ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4640         ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4641         ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4642
4643         // do stuff
4644         for (x = startx;x < endx;x++)
4645         {
4646                 float SafeScreenTexCoord[4];
4647                 float ScreenTexCoord[4];
4648                 float v[3];
4649                 float iw;
4650                 unsigned char c1[4];
4651                 unsigned char c2[4];
4652                 float Fresnel;
4653
4654                 z = buffer_z[x];
4655
4656                 // "    vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4657                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4658
4659                 // "    vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4660                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4661                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4662                 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4663                 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4664
4665                 // "    vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4666                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4667                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4668                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4669                 DPSOFTRAST_Vector3Normalize(v);
4670                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4671                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4672                 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4673                 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4674
4675                 // "    float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4676                 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4677                 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4678                 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4679                 DPSOFTRAST_Vector3Normalize(v);
4680                 Fresnel = 1.0f - v[2];
4681                 Fresnel = min(1.0f, Fresnel);
4682                 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4683
4684                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4685                 // "    dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4686                 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4687                 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4688
4689                 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4690                 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4691                 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4692                 buffer_FragColorbgra8[x*4+3] = min((    RefractColor[3] *  (1.0f - Fresnel) +          ReflectColor[3]  * Fresnel) * 256, 255);
4693         }
4694
4695         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4696 }
4697
4698
4699
4700 static void DPSOFTRAST_VertexShader_ShowDepth(void)
4701 {
4702         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4703 }
4704
4705 static void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4706 {
4707         // TODO: IMPLEMENT
4708         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4709         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4710         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4711         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4712         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4713 }
4714
4715
4716
4717 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4718 {
4719         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4720 }
4721
4722 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4723 {
4724         // TODO: IMPLEMENT
4725         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4726         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4727         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4728         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4729         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4730 }
4731
4732
4733
4734 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4735 {
4736         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4737 }
4738
4739 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4740 {
4741         // TODO: IMPLEMENT
4742         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4743         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4744         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4745         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4746         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4747 }
4748
4749
4750
4751 typedef struct DPSOFTRAST_ShaderModeInfo_s
4752 {
4753         int lodarrayindex;
4754         void (*Vertex)(void);
4755         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4756         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4757         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4758 }
4759 DPSOFTRAST_ShaderModeInfo;
4760
4761 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4762 {
4763         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4764         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4765         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4766         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4767         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4768         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4769         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4770         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4771         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4772         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4773         {2, DPSOFTRAST_VertexShader_VertexColor,                        DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4774         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4775         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4776         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4777         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4778         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4779         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4780         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4781 };
4782
4783 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4784 {
4785         int x;
4786         int startx;
4787         int endx;
4788         unsigned int *depthpixel;
4789         int depth;
4790         int depthslope;
4791         unsigned int d;
4792         unsigned char *pixelmask;
4793         DPSOFTRAST_State_Triangle *triangle;
4794         triangle = &thread->triangles[span->triangle];
4795         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4796         startx = span->startx;
4797         endx = span->endx;
4798         depth = span->depthbase;
4799         depthslope = span->depthslope;
4800         pixelmask = thread->pixelmaskarray;
4801         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4802         {
4803                 switch(thread->fb_depthfunc)
4804                 {
4805                 default:
4806                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4807                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4808                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4809                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4810                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4811                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4812                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4813                 }
4814                 while (startx < endx && !pixelmask[startx])
4815                         startx++;
4816                 while (endx > startx && !pixelmask[endx-1])
4817                         endx--;
4818         }
4819         else
4820         {
4821                 // no depth testing means we're just dealing with color...
4822                 memset(pixelmask + startx, 1, endx - startx);
4823         }
4824         span->pixelmask = pixelmask;
4825         span->startx = startx;
4826         span->endx = endx;
4827 }
4828
4829 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4830 {
4831         int x, d, depth, depthslope, startx, endx;
4832         const unsigned char *pixelmask;
4833         unsigned int *depthpixel;
4834         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4835         {
4836                 depth = span->depthbase;
4837                 depthslope = span->depthslope;
4838                 pixelmask = span->pixelmask;
4839                 startx = span->startx;
4840                 endx = span->endx;
4841                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4842                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4843                         if (pixelmask[x])
4844                                 depthpixel[x] = d;
4845         }
4846 }
4847
4848 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4849 {
4850         int i;
4851         DPSOFTRAST_State_Triangle *triangle;
4852         DPSOFTRAST_State_Span *span;
4853         for (i = 0; i < thread->numspans; i++)
4854         {
4855                 span = &thread->spans[i];
4856                 triangle = &thread->triangles[span->triangle];
4857                 DPSOFTRAST_Draw_DepthTest(thread, span);
4858                 if (span->startx >= span->endx)
4859                         continue;
4860                 // run pixel shader if appropriate
4861                 // do this before running depthmask code, to allow the pixelshader
4862                 // to clear pixelmask values for alpha testing
4863                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4864                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4865                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4866         }
4867         thread->numspans = 0;
4868 }
4869
// Payload of the queued Draw command (presumably DEFCOMMAND expands this into
// the DPSOFTRAST_Command_Draw struct with opcode 22 - confirm against the
// DEFCOMMAND macro definition). Fields as used by DPSOFTRAST_Interpret_Draw:
//   starty/endy  - vertical extent of the draw, checked against the thread's
//                  scanline bands to skip commands that cannot touch them
//   refcount     - atomically decremented by a thread that skips the command;
//                  the thread that reaches zero may free 'arrays'
//   clipped      - non-zero when triangles need near-plane clip tests
//   firstvertex  - subtracted from element indices to index into 'arrays'
//   numvertices  - vertex count; each array occupies numvertices*4 floats
//   numtriangles - number of triangles to set up
//   arrays       - screen-space coordinates followed by the vertex arrays
//   element3i / element3s - optional int / unsigned short index lists; when
//                  both are NULL vertices are consumed sequentially
4870 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;)
4871
4872 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4873 {
4874 #ifdef SSE_POSSIBLE
4875         int cullface = thread->cullface;
4876         int minx, maxx, miny, maxy;
4877         int miny1, maxy1, miny2, maxy2;
4878         __m128i fbmin, fbmax;
4879         __m128 viewportcenter, viewportscale;
4880         int firstvertex = command->firstvertex;
4881         int numvertices = command->numvertices;
4882         int numtriangles = command->numtriangles;
4883         const int *element3i = command->element3i;
4884         const unsigned short *element3s = command->element3s;
4885         int clipped = command->clipped;
4886         int i;
4887         int j;
4888         int k;
4889         int y;
4890         int e[3];
4891         __m128i screeny;
4892         int starty, endy, bandy;
4893         int numpoints;
4894         int clipcase;
4895         float clipdist[4];
4896         float clip0origin, clip0slope;
4897         int clip0dir;
4898         __m128 triangleedge1, triangleedge2, trianglenormal;
4899         __m128 clipfrac[3];
4900         __m128 screen[4];
4901         DPSOFTRAST_State_Triangle *triangle;
4902         DPSOFTRAST_Texture *texture;
4903         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4904         miny = thread->fb_scissor[1];
4905         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4906         miny1 = bound(miny, thread->miny1, maxy);
4907         maxy1 = bound(miny, thread->maxy1, maxy);
4908         miny2 = bound(miny, thread->miny2, maxy);
4909         maxy2 = bound(miny, thread->maxy2, maxy);
4910         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4911         {
4912                 if (!ATOMIC_DECREMENT(command->refcount))
4913                 {
4914                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4915                                 MM_FREE(command->arrays);
4916                 }
4917                 return;
4918         }
4919         minx = thread->fb_scissor[0];
4920         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4921         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4922         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4923         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4924         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4925         screen[3] = _mm_setzero_ps();
4926         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4927         for (i = 0;i < numtriangles;i++)
4928         {
4929                 const float *screencoord4f = command->arrays;
4930                 const float *arrays = screencoord4f + numvertices*4;
4931
4932                 // generate the 3 edges of this triangle
4933                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4934                 if (element3s)
4935                 {
4936                         e[0] = element3s[i*3+0] - firstvertex;
4937                         e[1] = element3s[i*3+1] - firstvertex;
4938                         e[2] = element3s[i*3+2] - firstvertex;
4939                 }
4940                 else if (element3i)
4941                 {
4942                         e[0] = element3i[i*3+0] - firstvertex;
4943                         e[1] = element3i[i*3+1] - firstvertex;
4944                         e[2] = element3i[i*3+2] - firstvertex;
4945                 }
4946                 else
4947                 {
4948                         e[0] = i*3+0;
4949                         e[1] = i*3+1;
4950                         e[2] = i*3+2;
4951                 }
4952
4953 #define SKIPBACKFACE \
4954                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4955                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4956                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4957                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4958                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4959                 switch(cullface) \
4960                 { \
4961                 case GL_BACK: \
4962                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4963                                 continue; \
4964                         break; \
4965                 case GL_FRONT: \
4966                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4967                                 continue; \
4968                         break; \
4969                 }
4970
4971 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4972                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4973                         { \
4974                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4975                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4976                         }
4977 #define CLIPPEDVERTEXCOPY(k,p1) \
4978                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4979
4980 #define GENATTRIBCOPY(attrib, p1) \
4981                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4982 #define GENATTRIBLERP(attrib, p1, p2) \
4983                 { \
4984                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4985                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4986                 }
4987 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4988                 switch(clipcase) \
4989                 { \
4990                 default: \
4991                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4992                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4993                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4994                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4995                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4996                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4997                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4998                 }
4999
5000                 if (! clipped)
5001                         goto notclipped;
5002
5003                 // calculate distance from nearplane
5004                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
5005                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
5006                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
5007                 if (clipdist[0] >= 0.0f)
5008                 {
5009                         if (clipdist[1] >= 0.0f)
5010                         {
5011                                 if (clipdist[2] >= 0.0f)
5012                                 {
5013                                 notclipped:
5014                                         // triangle is entirely in front of nearplane
5015                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5016                                         SKIPBACKFACE;
5017                                         numpoints = 3;
5018                                         clipcase = 0;
5019                                 }
5020                                 else
5021                                 {
5022                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5023                                         SKIPBACKFACE;
5024                                         numpoints = 4;
5025                                         clipcase = 1;
5026                                 }
5027                         }
5028                         else
5029                         {
5030                                 if (clipdist[2] >= 0.0f)
5031                                 {
5032                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5033                                         SKIPBACKFACE;
5034                                         numpoints = 4;
5035                                         clipcase = 2;
5036                                 }
5037                                 else
5038                                 {
5039                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5040                                         SKIPBACKFACE;
5041                                         numpoints = 3;
5042                                         clipcase = 3;
5043                                 }
5044                         }
5045                 }
5046                 else if (clipdist[1] >= 0.0f)
5047                 {
5048                         if (clipdist[2] >= 0.0f)
5049                         {
5050                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5051                                 SKIPBACKFACE;
5052                                 numpoints = 4;
5053                                 clipcase = 4;
5054                         }
5055                         else
5056                         {
5057                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5058                                 SKIPBACKFACE;
5059                                 numpoints = 3;
5060                                 clipcase = 5;
5061                         }
5062                 }
5063                 else if (clipdist[2] >= 0.0f)
5064                 {
5065                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5066                         SKIPBACKFACE;
5067                         numpoints = 3;
5068                         clipcase = 6;
5069                 }
5070                 else continue; // triangle is entirely behind nearplane
5071
5072                 {
5073                         // calculate integer y coords for triangle points
5074                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5075                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5076                                         screenmin = _mm_min_epi16(screeni, screenir),
5077                                         screenmax = _mm_max_epi16(screeni, screenir);
5078                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5079                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5080                         screenmin = _mm_max_epi16(screenmin, fbmin);
5081                         screenmax = _mm_min_epi16(screenmax, fbmax);
5082                         // skip offscreen triangles
5083                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5084                                 continue;
5085                         starty = _mm_extract_epi16(screenmin, 1);
5086                         endy = _mm_extract_epi16(screenmax, 1)+1;
5087                         if (starty >= maxy1 && endy <= miny2)
5088                                 continue;
5089                         screeny = _mm_srai_epi32(screeni, 16);
5090                 }
5091
5092                 triangle = &thread->triangles[thread->numtriangles];
5093
5094                 // calculate attribute plans for triangle data...
5095                 // okay, this triangle is going to produce spans, we'd better project
5096                 // the interpolants now (this is what gives perspective texturing),
5097                 // this consists of simply multiplying all arrays by the W coord
5098                 // (which is basically 1/Z), which will be undone per-pixel
5099                 // (multiplying by Z again) to get the perspective-correct array
5100                 // values
5101                 {
5102                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5103                         __m128 mipedgescale, mipdensity;
5104                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5105                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5106                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5107                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5108                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5109                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5110                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5111                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5112                         attribedge1 = _mm_sub_ss(w0, w1);
5113                         attribedge2 = _mm_sub_ss(w2, w1);
5114                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5115                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5116                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5117                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5118                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5119                         _mm_store_ss(&triangle->w[0], attribxslope);
5120                         _mm_store_ss(&triangle->w[1], attribyslope);
5121                         _mm_store_ss(&triangle->w[2], attriborigin);
5122                         
5123                         clip0origin = 0;
5124                         clip0slope = 0;
5125                         clip0dir = 0;
5126                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5127                         {
5128                                 float cliporigin, clipxslope, clipyslope;
5129                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5130                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5131                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5132                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5133                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5134                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5135                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5136                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5137                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5138                                 if(clipxslope != 0)
5139                                 {
5140                                         clip0origin = -cliporigin/clipxslope;
5141                                         clip0slope = -clipyslope/clipxslope;
5142                                         clip0dir = clipxslope > 0 ? 1 : -1;
5143                                 }
5144                                 else if(clipyslope > 0)
5145                                 {
5146                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5147                                         clip0slope = dpsoftrast.fb_width;
5148                                         clip0dir = -1;
5149                                 }
5150                                 else if(clipyslope < 0)
5151                                 {
5152                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5153                                         clip0slope = -dpsoftrast.fb_width;
5154                                         clip0dir = -1;
5155                                 }
5156                                 else if(clip0origin < 0) continue;
5157                         }
5158
5159                         mipedgescale = _mm_setzero_ps();
5160                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5161                         {
5162                                 __m128 attrib0, attrib1, attrib2;
5163                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5164                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5165                                         break;
5166                                 arrays += numvertices*4;
5167                                 GENATTRIBS(attrib0, attrib1, attrib2);
5168                                 attriborigin = _mm_mul_ps(attrib1, w1);
5169                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5170                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5171                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5172                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5173                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5174                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5175                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5176                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5177                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5178                                 {
5179                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5180                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5181                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5182                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5183                                 }
5184                         }
5185
5186                         memset(triangle->mip, 0, sizeof(triangle->mip));
5187                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5188                         {
5189                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5190                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5191                                         break;
5192                                 texture = thread->texbound[texunit];
5193                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5194                                 {
5195                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5196                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5197                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5198                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5199                                         // this will be multiplied in the texturing routine by the texture resolution
5200                                         y = _mm_cvtss_si32(mipdensity);
5201                                         if (y > 0)
5202                                         {
5203                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5204                                                 if (y > texture->mipmaps - 1)
5205                                                         y = texture->mipmaps - 1;
5206                                                 triangle->mip[texunit] = y;
5207                                         }
5208                                 }
5209                         }
5210                 }
5211         
5212                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5213                 for (; y < bandy;)
5214                 {
5215                         __m128 xcoords, xslope;
5216                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5217                         int yccmask = _mm_movemask_epi8(ycc);
5218                         int edge0p, edge0n, edge1p, edge1n;
5219                         int nexty;
5220                         float w, wslope;
5221                         float clip0;
5222                         if (numpoints == 4)
5223                         {
5224                                 switch(yccmask)
5225                                 {
5226                                 default:
5227                                 case 0xFFFF: /*0000*/ y = endy; continue;
5228                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5229                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5230                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5231                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5232                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5233                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5234                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5235                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5236                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5237                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5238                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5239                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5240                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5241                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5242                                 case 0x0000: /*1111*/ y++; continue;
5243                                 }
5244                         }
5245                         else
5246                         {
5247                                 switch(yccmask)
5248                                 {
5249                                 default:
5250                                 case 0xFFFF: /*000*/ y = endy; continue;
5251                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5252                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5253                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5254                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5255                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5256                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5257                                 case 0x0000: /*111*/ y++; continue;
5258                                 }
5259                         }
5260                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5261                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5262                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5263                         nexty = _mm_extract_epi16(ycc, 0);
5264                         if (nexty >= bandy) nexty = bandy-1;
5265                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5266                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5267                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5268                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5269                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5270                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5271                         {
5272                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5273                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5274                         }
5275                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5276                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5277                         {
5278                                 int startx, endx, offset;
5279                                 startx = _mm_cvtss_si32(xcoords);
5280                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5281                                 if (startx < minx) startx = minx;
5282                                 if (endx > maxx) endx = maxx;
5283                                 if (startx >= endx) continue;
5284
5285                                 if (clip0dir)
5286                                 {
5287                                         if (clip0dir > 0)
5288                                         {
5289                                                 if (startx < clip0) 
5290                                                 {
5291                                                         if(endx <= clip0) continue;
5292                                                         startx = (int)clip0;
5293                                                 }
5294                                         }
5295                                         else if (endx > clip0) 
5296                                         {
5297                                                 if(startx >= clip0) continue;
5298                                                 endx = (int)clip0;
5299                                         }
5300                                 }
5301                                                 
5302                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5303                                 {
5304                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5305                                         span->triangle = thread->numtriangles;
5306                                         span->x = offset;
5307                                         span->y = y;
5308                                         span->startx = 0;
5309                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5310                                         if (span->startx >= span->endx)
5311                                                 continue;
5312                                         wslope = triangle->w[0];
5313                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5314                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5315                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5316                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5317                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5318                                 }
5319                         }
5320                 }
5321
5322                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5323                 {
5324                         DPSOFTRAST_Draw_ProcessSpans(thread);
5325                         thread->numtriangles = 0;
5326                 }
5327         }
5328
5329         if (!ATOMIC_DECREMENT(command->refcount))
5330         {
5331                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5332                         MM_FREE(command->arrays);
5333         }
5334
5335         if (thread->numspans > 0 || thread->numtriangles > 0)
5336         {
5337                 DPSOFTRAST_Draw_ProcessSpans(thread);
5338                 thread->numtriangles = 0;
5339         }
5340 #endif
5341 }
5342
5343 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5344 {
5345         int i;
5346         int j;
5347         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5348         int datasize = 2*numvertices*sizeof(float[4]);
5349         DPSOFTRAST_Command_Draw *command;
5350         unsigned char *data;
5351         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5352         {
5353                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5354                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5355                         break;
5356                 datasize += numvertices*sizeof(float[4]);
5357         }
5358         if (element3s)
5359                 datasize += numtriangles*sizeof(unsigned short[3]);
5360         else if (element3i)
5361                 datasize += numtriangles*sizeof(int[3]);
5362         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5363         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5364         {
5365                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5366                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5367         }
5368         else
5369         {
5370                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5371                 data = (unsigned char *)command + commandsize;
5372         }
5373         command->firstvertex = firstvertex;
5374         command->numvertices = numvertices;
5375         command->numtriangles = numtriangles;
5376         command->arrays = (float *)data;
5377         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5378         dpsoftrast.firstvertex = firstvertex;
5379         dpsoftrast.numvertices = numvertices;
5380         dpsoftrast.screencoord4f = (float *)data;
5381         data += numvertices*sizeof(float[4]);
5382         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5383         data += numvertices*sizeof(float[4]);
5384         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5385         {
5386                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5387                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5388                         break;
5389                 dpsoftrast.post_array4f[j] = (float *)data;
5390                 data += numvertices*sizeof(float[4]);
5391         }
5392         command->element3i = NULL;
5393         command->element3s = NULL;
5394         if (element3s)
5395         {
5396                 command->element3s = (unsigned short *)data;
5397                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5398         }
5399         else if (element3i)
5400         {
5401                 command->element3i = (int *)data;
5402                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5403         }
5404         return command;
5405 }
5406
5407 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5408 {
5409         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5410         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5411         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5412         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5413         if (command->starty >= command->endy)
5414         {
5415                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5416                         MM_FREE(command->arrays);
5417                 DPSOFTRAST_UndoCommand(command->commandsize);
5418                 return;
5419         }
5420         command->clipped = dpsoftrast.drawclipped;
5421         command->refcount = dpsoftrast.numthreads;
5422
5423         if (dpsoftrast.usethreads)
5424         {
5425                 int i;
5426                 DPSOFTRAST_Draw_SyncCommands();
5427                 for (i = 0; i < dpsoftrast.numthreads; i++)
5428                 {
5429                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5430                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5431                                 Thread_CondSignal(thread->drawcond);
5432                 }
5433         }
5434         else
5435         {
5436                 DPSOFTRAST_Draw_FlushThreads();
5437         }
5438 }
5439
5440 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
5441 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5442 {
5443         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5444 }
5445 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5446 {
5447         DPSOFTRAST_Command_SetRenderTargets *command;
5448         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5449                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5450                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5451                 DPSOFTRAST_Flush();
5452         dpsoftrast.fb_width = width;
5453         dpsoftrast.fb_height = height;
5454         dpsoftrast.fb_depthpixels = depthpixels;
5455         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5456         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5457         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5458         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5459         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5460         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5461         command->width = width;
5462         command->height = height;
5463 }
5464  
5465 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5466 {
5467         int commandoffset = thread->commandoffset;
5468         while (commandoffset != endoffset)
5469         {
5470                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5471                 switch (command->opcode)
5472                 {
5473 #define INTERPCOMMAND(name) \
5474                 case DPSOFTRAST_OPCODE_##name : \
5475                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5476                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5477                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5478                                 commandoffset = 0; \
5479                         break;
5480                 INTERPCOMMAND(Viewport)
5481                 INTERPCOMMAND(ClearColor)
5482                 INTERPCOMMAND(ClearDepth)
5483                 INTERPCOMMAND(ColorMask)
5484                 INTERPCOMMAND(DepthTest)
5485                 INTERPCOMMAND(ScissorTest)
5486                 INTERPCOMMAND(Scissor)
5487                 INTERPCOMMAND(BlendFunc)
5488                 INTERPCOMMAND(BlendSubtract)
5489                 INTERPCOMMAND(DepthMask)
5490                 INTERPCOMMAND(DepthFunc)
5491                 INTERPCOMMAND(DepthRange)
5492                 INTERPCOMMAND(PolygonOffset)
5493                 INTERPCOMMAND(CullFace)
5494                 INTERPCOMMAND(SetTexture)
5495                 INTERPCOMMAND(SetShader)
5496                 INTERPCOMMAND(Uniform4f)
5497                 INTERPCOMMAND(UniformMatrix4f)
5498                 INTERPCOMMAND(Uniform1i)
5499                 INTERPCOMMAND(SetRenderTargets)
5500                 INTERPCOMMAND(ClipPlane)
5501
5502                 case DPSOFTRAST_OPCODE_Draw:
5503                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5504                         commandoffset += command->commandsize;
5505                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5506                                 commandoffset = 0;
5507                         thread->commandoffset = commandoffset;
5508                         break;
5509
5510                 case DPSOFTRAST_OPCODE_Reset:
5511                         commandoffset = 0;
5512                         break;
5513                 }
5514         }
5515         thread->commandoffset = commandoffset;
5516 }
5517
5518 static int DPSOFTRAST_Draw_Thread(void *data)
5519 {
5520         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5521         while(thread->index >= 0)
5522         {
5523                 if (thread->commandoffset != dpsoftrast.drawcommand)
5524                 {
5525                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5526                 }
5527                 else 
5528                 {
5529                         Thread_LockMutex(thread->drawmutex);
5530                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5531                         {
5532                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5533                                 thread->starving = true;
5534                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5535                                 thread->starving = false;
5536                         }
5537                         Thread_UnlockMutex(thread->drawmutex);
5538                 }
5539         }   
5540         return 0;
5541 }
5542
5543 static void DPSOFTRAST_Draw_FlushThreads(void)
5544 {
5545         DPSOFTRAST_State_Thread *thread;
5546         int i;
5547         DPSOFTRAST_Draw_SyncCommands();
5548         if (dpsoftrast.usethreads) 
5549         {
5550                 for (i = 0; i < dpsoftrast.numthreads; i++)
5551                 {
5552                         thread = &dpsoftrast.threads[i];
5553                         if (thread->commandoffset != dpsoftrast.drawcommand)
5554                         {
5555                                 Thread_LockMutex(thread->drawmutex);
5556                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5557                                         Thread_CondSignal(thread->drawcond);
5558                                 Thread_UnlockMutex(thread->drawmutex);
5559                         }
5560                 }
5561                 for (i = 0; i < dpsoftrast.numthreads; i++)
5562                 {
5563                         thread = &dpsoftrast.threads[i];
5564                         if (thread->commandoffset != dpsoftrast.drawcommand)
5565                         {
5566                                 Thread_LockMutex(thread->drawmutex);
5567                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5568                                 {
5569                                         thread->waiting = true;
5570                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5571                                         thread->waiting = false;
5572                                 }
5573                                 Thread_UnlockMutex(thread->drawmutex);
5574                         }
5575                 }
5576         }
5577         else
5578         {
5579                 for (i = 0; i < dpsoftrast.numthreads; i++)
5580                 {
5581                         thread = &dpsoftrast.threads[i];
5582                         if (thread->commandoffset != dpsoftrast.drawcommand)
5583                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5584                 }
5585         }
5586         dpsoftrast.commandpool.usedcommands = 0;
5587 }
5588
// Public entry point that executes all queued commands; forwards to
// DPSOFTRAST_Draw_FlushThreads().
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5593
// Same effect as DPSOFTRAST_Flush(): executes all queued commands before
// returning.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5598
5599 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5600 {
5601         int i;
5602         union
5603         {
5604                 int i;
5605                 unsigned char b[4];
5606         }
5607         u;
5608         u.i = 1;
5609         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5610         dpsoftrast.bigendian = u.b[3];
5611         dpsoftrast.fb_width = width;
5612         dpsoftrast.fb_height = height;
5613         dpsoftrast.fb_depthpixels = depthpixels;
5614         dpsoftrast.fb_colorpixels[0] = colorpixels;
5615         dpsoftrast.fb_colorpixels[1] = NULL;
5616         dpsoftrast.fb_colorpixels[1] = NULL;
5617         dpsoftrast.fb_colorpixels[1] = NULL;
5618         dpsoftrast.viewport[0] = 0;
5619         dpsoftrast.viewport[1] = 0;
5620         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5621         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5622         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5623         dpsoftrast.texture_firstfree = 1;
5624         dpsoftrast.texture_end = 1;
5625         dpsoftrast.texture_max = 0;
5626         dpsoftrast.color[0] = 1;
5627         dpsoftrast.color[1] = 1;
5628         dpsoftrast.color[2] = 1;
5629         dpsoftrast.color[3] = 1;
5630         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5631         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5632         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5633         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5634         for (i = 0; i < dpsoftrast.numthreads; i++)
5635         {
5636                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5637                 thread->index = i;
5638                 thread->cullface = GL_BACK;
5639         thread->colormask[0] = 1; 
5640                 thread->colormask[1] = 1;
5641                 thread->colormask[2] = 1;
5642                 thread->colormask[3] = 1;
5643                 thread->blendfunc[0] = GL_ONE;
5644                 thread->blendfunc[1] = GL_ZERO;
5645                 thread->depthmask = true;
5646                 thread->depthtest = true;
5647                 thread->depthfunc = GL_LEQUAL;
5648                 thread->scissortest = false;
5649                 thread->viewport[0] = 0;
5650                 thread->viewport[1] = 0;
5651                 thread->viewport[2] = dpsoftrast.fb_width;
5652                 thread->viewport[3] = dpsoftrast.fb_height;
5653                 thread->scissor[0] = 0;
5654                 thread->scissor[1] = 0;
5655                 thread->scissor[2] = dpsoftrast.fb_width;
5656                 thread->scissor[3] = dpsoftrast.fb_height;
5657                 thread->depthrange[0] = 0;
5658                 thread->depthrange[1] = 1;
5659                 thread->polygonoffset[0] = 0;
5660                 thread->polygonoffset[1] = 0;
5661                 thread->clipplane[0] = 0;
5662                 thread->clipplane[1] = 0;
5663                 thread->clipplane[2] = 0;
5664                 thread->clipplane[3] = 1;
5665         
5666                 thread->numspans = 0;
5667                 thread->numtriangles = 0;
5668                 thread->commandoffset = 0;
5669                 thread->waiting = false;
5670                 thread->starving = false;
5671            
5672                 thread->validate = -1;
5673                 DPSOFTRAST_Validate(thread, -1);
5674  
5675                 if (dpsoftrast.usethreads)
5676                 {
5677                         thread->waitcond = Thread_CreateCond();
5678                         thread->drawcond = Thread_CreateCond();
5679                         thread->drawmutex = Thread_CreateMutex();
5680                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5681                 }
5682         }
5683         return 0;
5684 }
5685
5686 void DPSOFTRAST_Shutdown(void)
5687 {
5688         int i;
5689         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5690         {
5691                 DPSOFTRAST_State_Thread *thread;
5692                 for (i = 0; i < dpsoftrast.numthreads; i++)
5693                 {
5694                         thread = &dpsoftrast.threads[i];
5695                         Thread_LockMutex(thread->drawmutex);
5696                         thread->index = -1;
5697                         Thread_CondSignal(thread->drawcond);
5698                         Thread_UnlockMutex(thread->drawmutex);
5699                         Thread_WaitThread(thread->thread, 0);
5700                         Thread_DestroyCond(thread->waitcond);
5701                         Thread_DestroyCond(thread->drawcond);
5702                         Thread_DestroyMutex(thread->drawmutex);
5703                 }
5704         }
5705         for (i = 0;i < dpsoftrast.texture_end;i++)
5706                 if (dpsoftrast.texture[i].bytes)
5707                         MM_FREE(dpsoftrast.texture[i].bytes);
5708         if (dpsoftrast.texture)
5709                 free(dpsoftrast.texture);
5710         if (dpsoftrast.threads)
5711                 MM_FREE(dpsoftrast.threads);
5712         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5713 }
5714