]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
buffer csprogs downloads and load csprogs from the buffer instead of a file, if available
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
// When compiled as C (not C++), provide a `bool` type by aliasing qboolean.
// qboolean is not defined in this file; presumably it comes from quakedef.h.
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
// ALIGN_SIZE is the alignment passed to _mm_malloc by the allocator macros below.
// Note that the ALIGN() macros hard-code 16 rather than using ALIGN_SIZE,
// and the ATOMIC() macros hard-code 4 rather than using ATOMIC_SIZE.
17 #define ALIGN_SIZE 16
18 #define ATOMIC_SIZE 4
19
// Compiler/platform abstraction, only active when SSE_POSSIBLE is defined:
//   ALIGN(var)              declares var with 16-byte alignment
//   ATOMIC(var)             declares var with 4-byte alignment
//   MEMORY_BARRIER          store fence (_mm_sfence in every branch)
//   ATOMIC_COUNTER          type used for atomically updated counters
//   ATOMIC_INCREMENT/DECREMENT/ADD  interlocked operations on such a counter
20 #ifdef SSE_POSSIBLE
21         #if defined(__APPLE__)
22                 #include <libkern/OSAtomic.h>
23                 #define ALIGN(var) var __attribute__((__aligned__(16)))
24                 #define ATOMIC(var) var __attribute__((__aligned__(4)))
25                 #define MEMORY_BARRIER (_mm_sfence())
26                 #define ATOMIC_COUNTER volatile int32_t 
27                 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28                 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29                 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
30         #elif defined(__GNUC__) && defined(WIN32)
31                 #define ALIGN(var) var __attribute__((__aligned__(16)))
32                 #define ATOMIC(var) var __attribute__((__aligned__(4)))
33                 #define MEMORY_BARRIER (_mm_sfence())
34                 //(__sync_synchronize())
35                 #define ATOMIC_COUNTER volatile LONG
36                 // this LONG * cast serves to fix an issue with broken mingw
37                 // packages on Ubuntu; these only declare the function to take
38                 // a LONG *, causing a compile error here. This seems to be
39                 // error- and warn-free on platforms that DO declare
40                 // InterlockedIncrement correctly, like mingw on Windows.
41                 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
42                 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
43                 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
44         #elif defined(__GNUC__)
45                 #define ALIGN(var) var __attribute__((__aligned__(16)))
46                 #define ATOMIC(var) var __attribute__((__aligned__(4)))
47                 #define MEMORY_BARRIER (_mm_sfence())
48                 //(__sync_synchronize())
49                 #define ATOMIC_COUNTER volatile int
50                 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
51                 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
52                 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
53         #elif defined(_MSC_VER)
54                 #define ALIGN(var) __declspec(align(16)) var
55                 #define ATOMIC(var) __declspec(align(4)) var
56                 #define MEMORY_BARRIER (_mm_sfence())
57                 //(MemoryBarrier())
58                 #define ATOMIC_COUNTER volatile LONG
59                 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
60                 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
61                 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
62         #endif
63 #endif
64
// Fallbacks for anything left undefined above (no SSE, or an unrecognised
// compiler). These are plain, NON-atomic operations with no alignment and
// no fence, so they are only correct when a single thread touches the data.
65 #ifndef ALIGN
66 #define ALIGN(var) var
67 #endif
68 #ifndef ATOMIC
69 #define ATOMIC(var) var
70 #endif
71 #ifndef MEMORY_BARRIER
72 #define MEMORY_BARRIER ((void)0)
73 #endif
74 #ifndef ATOMIC_COUNTER
75 #define ATOMIC_COUNTER int
76 #endif
77 #ifndef ATOMIC_INCREMENT
78 #define ATOMIC_INCREMENT(counter) (++(counter))
79 #endif
80 #ifndef ATOMIC_DECREMENT
81 #define ATOMIC_DECREMENT(counter) (--(counter))
82 #endif
83 #ifndef ATOMIC_ADD
84 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
85 #endif
86
// Aligned allocation helpers.
// With SSE the pixel/vertex buffers must satisfy ALIGN_SIZE, so allocation goes
// through _mm_malloc/_mm_free; otherwise the plain C allocator is used.
// Memory obtained from MM_MALLOC/MM_CALLOC must always be released with MM_FREE.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// Compatibility shim: supply _mm_cvtss_f32 through the GCC builtin on GCC
// releases older than 4.6 (the version the original test evidently targets).
// clang also defines __GNUC__ but is excluded explicitly.
// The version test compares major and minor together so that e.g. GCC 5.1 is
// not mistaken for an old compiler.
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
	#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif

#define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)

// Aligned equivalent of calloc(): returns nmemb*size zero-filled bytes aligned
// to ALIGN_SIZE, or NULL if the allocation fails or nmemb*size would overflow.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// reject requests whose byte count does not fit in size_t
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb*size;
	ptr = _mm_malloc(total, ALIGN_SIZE);
	if (ptr != NULL) memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
109
// Identifies one per-vertex attribute array. Values are used as the first
// index of DPSOFTRAST_State_Triangle.attribs and of DPSOFTRAST_State.post_array4f.
110 typedef enum DPSOFTRAST_ARRAY_e
111 {
112         DPSOFTRAST_ARRAY_POSITION,
113         DPSOFTRAST_ARRAY_COLOR,
114         DPSOFTRAST_ARRAY_TEXCOORD0,
115         DPSOFTRAST_ARRAY_TEXCOORD1,
116         DPSOFTRAST_ARRAY_TEXCOORD2,
117         DPSOFTRAST_ARRAY_TEXCOORD3,
118         DPSOFTRAST_ARRAY_TEXCOORD4,
119         DPSOFTRAST_ARRAY_TEXCOORD5,
120         DPSOFTRAST_ARRAY_TEXCOORD6,
121         DPSOFTRAST_ARRAY_TEXCOORD7,
122         DPSOFTRAST_ARRAY_TOTAL // number of arrays; used to size tables, not a real array
123 }
124 DPSOFTRAST_ARRAY;
125
// One slot of the texture table (dpsoftrast.texture[]).
// A slot whose `bytes` pointer is NULL is considered free.
126 typedef struct DPSOFTRAST_Texture_s
127 {
128         int flags; // flags value passed to DPSOFTRAST_Texture_New (format and FLAG_* bits)
129         int width; // dimensions of the base (largest) mip level
130         int height;
131         int depth;
132         int sides; // 6 for cubemaps, 1 otherwise
133         DPSOFTRAST_TEXTURE_FILTER filter;
134         int mipmaps; // number of mip levels described in mipmap[]
135         int size; // total size in bytes of all mip levels
136         ATOMIC_COUNTER binds; // when nonzero, DPSOFTRAST_Texture_Free flushes before releasing the pixels
137         unsigned char *bytes; // pixel storage for all mip levels (allocated with MM_CALLOC)
        // per mip level: [0] byte offset into bytes, [1] size in bytes, [2] width, [3] height, [4] depth
138         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
139 }
140 DPSOFTRAST_Texture;
141
// Command records use the same alignment as everything else (ALIGN_SIZE / ALIGN()).
142 #define COMMAND_SIZE ALIGN_SIZE
143 #define COMMAND_ALIGN(var) ALIGN(var)
144
// Common header: every command struct generated by DEFCOMMAND starts with these
// two fields in this order.
145 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
146 {
147         unsigned char opcode; // one of the DPSOFTRAST_OPCODE_* constants
148         unsigned short commandsize; // size of this command record
149 }
150 DPSOFTRAST_Command);
151
// Opcode 0 is defined by hand; all other opcodes are introduced by DEFCOMMAND.
152 enum { DPSOFTRAST_OPCODE_Reset = 0 };
153
// DEFCOMMAND(opcodeval, name, fields) declares, for one command kind:
//   - the constant DPSOFTRAST_OPCODE_<name> with value opcodeval
//   - the aligned struct type DPSOFTRAST_Command_<name>, which consists of the
//     common opcode/commandsize header followed by `fields`
154 #define DEFCOMMAND(opcodeval, name, fields) \
155         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
156         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
157         { \
158                 unsigned char opcode; \
159                 unsigned short commandsize; \
160                 fields \
161         } DPSOFTRAST_Command_##name );
162
// Size in bytes of the command buffer (2 MiB), and a per-command size limit (16 KiB).
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
165
// Fixed-size buffer holding queued command records.
// NOTE(review): freecommand/usedcommands look like a write offset and a
// byte count respectively; their users are outside this section, so confirm.
166 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
167 {
168         int freecommand;
169         int usedcommands;
170         ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
171 }
172 DPSOFTRAST_State_Command_Pool);
173
// Per-triangle interpolation setup consumed by the span shaders.
174 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
175 {
176         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
177         float w[3];
        // attribs[array][k] is a 4-float vector; as used by DPSOFTRAST_CALCATTRIB:
        //   k=0: change per unit of span x, k=1: change per unit of span y, k=2: base value
        // Aligned because the SSE path reads these with _mm_load_ps.
178         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
179 }
180 DPSOFTRAST_State_Triangle);
181
// Evaluates attribute `arrayindex` of `triangle` at the start of `span` (SSE version).
//   slope receives the per-pixel x gradient (attribs[arrayindex][0])
//   data  receives base + span->x * xslope + span->y * yslope
// triangle and span are pointer expressions; data and slope are __m128 lvalues.
// The macro expands to a braced block, so it must be used as a statement.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of DPSOFTRAST_CALCATTRIB: data and slope are float[4] (or float *).
// Every pointer/array argument is parenthesized so that expressions such as
// `&localspan` or `base + n` can be passed safely.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
198                                         
199 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
200
// One horizontal run of pixels produced by rasterizing a triangle.
// x and y are also the coordinates at which DPSOFTRAST_CALCATTRIB evaluates
// the triangle's attribute planes.
201 typedef ALIGN(struct DPSOFTRAST_State_Span_s
202 {
203         int triangle; // triangle this span was generated by
204         int x; // framebuffer x coord
205         int y; // framebuffer y coord
206         int startx; // usable range (according to pixelmask)
207         int endx; // usable range (according to pixelmask)
208         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
209         int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
210         int depthslope; // depthbuffer value pixel delta
211 }
212 DPSOFTRAST_State_Span);
213
// Capacities of the per-thread span and triangle arrays and of the pixel mask
// buffer (see DPSOFTRAST_State_Thread).
214 #define DPSOFTRAST_DRAW_MAXSPANS 1024
215 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
216 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
217
// Bit flags stored in DPSOFTRAST_State_Thread.validate. A set bit means the
// matching group of derived values is stale and must be recomputed by
// DPSOFTRAST_Validate before use.
218 #define DPSOFTRAST_VALIDATE_FB 1
219 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
220 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all of the above
221 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
222
// Blend modes the rasterizer implements. DPSOFTRAST_RecalcBlendFunc maps each
// supported (source factor, destination factor) pair onto one of these; the GL
// pair is noted beside each entry. Unrecognised pairs fall back to OPAQUE.
223 typedef enum DPSOFTRAST_BLENDMODE_e
224 {
225         DPSOFTRAST_BLENDMODE_OPAQUE, // GL_ONE, GL_ZERO
226         DPSOFTRAST_BLENDMODE_ALPHA, // GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA
227         DPSOFTRAST_BLENDMODE_ADDALPHA, // GL_SRC_ALPHA, GL_ONE
228         DPSOFTRAST_BLENDMODE_ADD, // GL_ONE, GL_ONE
229         DPSOFTRAST_BLENDMODE_INVMOD, // GL_ZERO, GL_ONE_MINUS_SRC_COLOR
230         DPSOFTRAST_BLENDMODE_MUL, // GL_ZERO, GL_SRC_COLOR or GL_DST_COLOR, GL_ZERO
231         DPSOFTRAST_BLENDMODE_MUL2, // GL_DST_COLOR, GL_SRC_COLOR
232         DPSOFTRAST_BLENDMODE_SUBALPHA, // GL_SRC_ALPHA, GL_ONE with blendsubtract enabled
233         DPSOFTRAST_BLENDMODE_PSEUDOALPHA, // GL_ONE, GL_ONE_MINUS_SRC_ALPHA
234         DPSOFTRAST_BLENDMODE_INVADD, // GL_ONE_MINUS_DST_COLOR, GL_ONE
235         DPSOFTRAST_BLENDMODE_TOTAL // number of blend modes
236 }
237 DPSOFTRAST_BLENDMODE;
238
// State owned by one rasterizer thread (an element of dpsoftrast.threads[]):
// a private copy of the render state, values derived from that state, the band
// of framebuffer rows the thread covers, and scratch arrays for spans/triangles.
239 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
240 {
241         void *thread;
242         int index; // position of this thread within dpsoftrast.threads; selects its row band
243         
        // render state (inputs to the Recalc* functions)
244         int cullface;
245         int colormask[4];
246         int blendfunc[2]; // [0] source factor, [1] destination factor (GL_* values)
247         int blendsubtract;
248         int depthmask;
249         int depthtest; // when zero, fb_depthfunc is forced to GL_ALWAYS
250         int depthfunc;
251         int scissortest; // when zero, the scissor rectangle is the whole framebuffer
252         int viewport[4]; // x, y, width, height
253         int scissor[4]; // x, y, width, height (y measured from the bottom; flipped in DPSOFTRAST_RecalcFB)
254         float depthrange[2];
255         float polygonoffset[2];
256         float clipplane[4];
257         ALIGN(float fb_clipplane[4]); // clipplane transformed by DPSOFTRAST_RecalcClipPlane
258
259         int shader_mode;
260         int shader_permutation;
261         int shader_exactspecularmath;
262
        // pointers into dpsoftrast.texture[]; rebased by DPSOFTRAST_Texture_Grow when that array moves
263         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
264         
265         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
266         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
267
268         // DPSOFTRAST_VALIDATE_ flags
269         int validate;
270
271         // derived values (DPSOFTRAST_VALIDATE_FB)
272         int fb_colormask;
273         int fb_scissor[4]; // x, y, width, height in framebuffer rows/columns, clamped to the framebuffer
        // viewport transform; component order is [1]=x, [2]=y, [3]=z, [0]=w (see DPSOFTRAST_RecalcViewport)
274         ALIGN(float fb_viewportcenter[4]);
275         ALIGN(float fb_viewportscale[4]);
276
277         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
278         int fb_depthfunc;
279
280         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
281         int fb_blendmode; // a DPSOFTRAST_BLENDMODE value
282
283         // band boundaries
        // two row ranges [miny1,maxy1) and [miny2,maxy2); identical unless dpsoftrast.interlace is set
284         int miny1;
285         int maxy1;
286         int miny2;
287         int maxy2;
288
289         ATOMIC(volatile int commandoffset);
290
291         volatile bool waiting;
292         volatile bool starving;
293         void *waitcond;
294         void *drawcond;
295         void *drawmutex;
296
297         int numspans;
298         int numtriangles;
299         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
300         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
301         unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
302 }
303 DPSOFTRAST_State_Thread);
304
// Global rasterizer state: framebuffer description, the main-thread copy of the
// render state, vertex array pointers, the texture table, the worker thread
// array and the command pool.
305 typedef ALIGN(struct DPSOFTRAST_State_s
306 {
307         int fb_width;
308         int fb_height;
309         unsigned int *fb_depthpixels;
310         unsigned int *fb_colorpixels[4];
311
312         int viewport[4];
313         ALIGN(float fb_viewportcenter[4]);
314         ALIGN(float fb_viewportscale[4]);
315
316         float color[4];
317         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
318         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
319
        // vertex array sources and their layout
320         const float *pointer_vertex3f;
321         const float *pointer_color4f;
322         const unsigned char *pointer_color4ub;
323         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
324         int stride_vertex;
325         int stride_color;
326         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
327         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        // pointers into texture[]; rebased by DPSOFTRAST_Texture_Grow when that array moves
328         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
329
330         int firstvertex;
331         int numvertices;
332         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL]; // indexed by DPSOFTRAST_ARRAY
333         float *screencoord4f;
334         int drawstarty;
335         int drawendy;
336         int drawclipped;
337         
338         int shader_mode;
339         int shader_permutation;
340         int shader_exactspecularmath;
341
        // texture table: texture[] has texture_max allocated slots; slots at index
        // texture_end and above are unused; the search for a free slot (bytes == NULL)
        // starts at texture_firstfree
342         int texture_max;
343         int texture_end;
344         int texture_firstfree;
345         DPSOFTRAST_Texture *texture;
346
347         int bigendian;
348
349         // error reporting
        // points at a string literal describing the most recent failure
350         const char *errorstring;
351
352         bool usethreads;
353         int interlace; // when nonzero each thread gets two separate row bands (see DPSOFTRAST_RecalcThread)
354         int numthreads; // number of elements in threads[]
355         DPSOFTRAST_State_Thread *threads;
356
357         ATOMIC(volatile int drawcommand);
358
359         DPSOFTRAST_State_Command_Pool commandpool;
360 }
361 DPSOFTRAST_State);
362
// The single global instance used by every function in this file.
363 DPSOFTRAST_State dpsoftrast;
364
// Depth buffer encoding: a [0,1] float depth d is stored as DEPTHSCALE * (1 - d),
// where DEPTHSCALE is 2^30.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one 32-bit pixel with blue in
// bits 0-7, green in 8-15, red in 16-23 and alpha in 24-31, rounding to nearest.
// Arguments are parenthesized so expressions can be passed, and the shifts are
// done on unsigned values because shifting alpha >= 128 into bit 31 of a signed
// int is undefined; the result is converted back to int as before.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth to the integer depth buffer encoding described above.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
369
// Forward declarations; both functions are defined later in the file.
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
372
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
374 {
375         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377         fb_viewportcenter[3] = 0.5f;
378         fb_viewportcenter[0] = 0.0f;
379         fb_viewportscale[1] = 0.5f * viewport[2];
380         fb_viewportscale[2] = -0.5f * viewport[3];
381         fb_viewportscale[3] = 0.5f;
382         fb_viewportscale[0] = 1.0f;
383 }
384
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
386 {
387         if (dpsoftrast.interlace)
388         {
389                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
393         }
394         else
395         {
396                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
398         }
399 }
400
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
402 {
403         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
408 }
409
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
411 {
412         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413         // and viewport projection values
414         int x1, x2;
415         int y1, y2;
416         x1 = thread->scissor[0];
417         x2 = thread->scissor[0] + thread->scissor[2];
418         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419         y2 = dpsoftrast.fb_height - thread->scissor[1];
420         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
421         if (x1 < 0) x1 = 0;
422         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
423         if (y1 < 0) y1 = 0;
424         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
425         thread->fb_scissor[0] = x1;
426         thread->fb_scissor[1] = y1;
427         thread->fb_scissor[2] = x2 - x1;
428         thread->fb_scissor[3] = y2 - y1;
429
430         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431         DPSOFTRAST_RecalcClipPlane(thread);
432         DPSOFTRAST_RecalcThread(thread);
433 }
434
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
436 {
437         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
438 }
439
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
441 {
442         if (thread->blendsubtract)
443         {
444                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
445                 {
446                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
450                 }
451         }
452         else
453         {       
454                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
455                 {
456                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
461                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
467                 }
468         }
469 }
470
// Cheap inline guard around DPSOFTRAST_Validate: only makes the call when at
// least one of the flags in f is pending on the thread. Always evaluates to 0.
// Both arguments are parenthesized so pointer expressions can be passed.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
472
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
474 {
475         mask &= thread->validate;
476         if (!mask)
477                 return;
478         if (mask & DPSOFTRAST_VALIDATE_FB)
479         {
480                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481                 DPSOFTRAST_RecalcFB(thread);
482         }
483         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
484         {
485                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486                 DPSOFTRAST_RecalcDepthFunc(thread);
487         }
488         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
489         {
490                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491                 DPSOFTRAST_RecalcBlendFunc(thread);
492         }
493 }
494
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
496 {
497         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498                 return &dpsoftrast.texture[index];
499         return NULL;
500 }
501
502 static void DPSOFTRAST_Texture_Grow(void)
503 {
504         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505         DPSOFTRAST_State_Thread *thread;
506         int i;
507         int j;
508         DPSOFTRAST_Flush();
509         // expand texture array as needed
510         if (dpsoftrast.texture_max < 1024)
511                 dpsoftrast.texture_max = 1024;
512         else
513                 dpsoftrast.texture_max *= 2;
514         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
515         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516                 if (dpsoftrast.texbound[i])
517                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
518         for (j = 0; j < dpsoftrast.numthreads; j++)
519         {
520                 thread = &dpsoftrast.threads[j];
521                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522                         if (thread->texbound[i])
523                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
524         }
525 }
526
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
528 {
529         int w;
530         int h;
531         int d;
532         int size;
533         int s;
534         int texnum;
535         int mipmaps;
536         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538         DPSOFTRAST_Texture *texture;
539         if (width*height*depth < 1)
540         {
541                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
542                 return 0;
543         }
544         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
545         {
546                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
547                 return 0;
548         }
549         switch(texformat)
550         {
551         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
554                 break;
555         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
557                 {
558                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
559                         return 0;
560                 }
561                 if (depth != 1)
562                 {
563                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
564                         return 0;
565                 }
566                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
567                 {
568                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
569                         return 0;
570                 }
571                 break;
572         }
573         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
574         {
575                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
576                 return 0;
577         }
578         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
579         {
580                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
581                 return 0;
582         }
583         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
584         {
585                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
586                 return 0;
587         }
588         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
589         {
590                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
591                 return 0;
592         }
593         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
594         {
595                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
596                 return 0;
597         }
598         // find first empty slot in texture array
599         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600                 if (!dpsoftrast.texture[texnum].bytes)
601                         break;
602         dpsoftrast.texture_firstfree = texnum + 1;
603         if (dpsoftrast.texture_max <= texnum)
604                 DPSOFTRAST_Texture_Grow();
605         if (dpsoftrast.texture_end <= texnum)
606                 dpsoftrast.texture_end = texnum + 1;
607         texture = &dpsoftrast.texture[texnum];
608         memset(texture, 0, sizeof(*texture));
609         texture->flags = flags;
610         texture->width = width;
611         texture->height = height;
612         texture->depth = depth;
613         texture->sides = sides;
614         texture->binds = 0;
615         w = width;
616         h = height;
617         d = depth;
618         size = 0;
619         mipmaps = 0;
620         for (;;)
621         {
622                 s = w * h * d * sides * 4;
623                 texture->mipmap[mipmaps][0] = size;
624                 texture->mipmap[mipmaps][1] = s;
625                 texture->mipmap[mipmaps][2] = w;
626                 texture->mipmap[mipmaps][3] = h;
627                 texture->mipmap[mipmaps][4] = d;
628                 size += s;
629                 mipmaps++;
630                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
631                         break;
632                 if (w > 1) w >>= 1;
633                 if (h > 1) h >>= 1;
634                 if (d > 1) d >>= 1;
635         }
636         texture->mipmaps = mipmaps;
637         texture->size = size;
638
639         // allocate the pixels now
640         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
641
642         return texnum;
643 }
644 void DPSOFTRAST_Texture_Free(int index)
645 {
646         DPSOFTRAST_Texture *texture;
647         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
648         if (texture->binds)
649                 DPSOFTRAST_Flush();
650         if (texture->bytes)
651                 MM_FREE(texture->bytes);
652         texture->bytes = NULL;
653         memset(texture, 0, sizeof(*texture));
654         // adjust the free range and used range
655         if (dpsoftrast.texture_firstfree > index)
656                 dpsoftrast.texture_firstfree = index;
657         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
658                 dpsoftrast.texture_end--;
659 }
660 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
661 {
662         int i, x, y, z, w, layer0, layer1, row0, row1;
663         unsigned char *o, *i0, *i1, *i2, *i3;
664         DPSOFTRAST_Texture *texture;
665         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
666         if (texture->mipmaps <= 1)
667                 return;
668         for (i = 1;i < texture->mipmaps;i++)
669         {
670                 for (z = 0;z < texture->mipmap[i][4];z++)
671                 {
672                         layer0 = z*2;
673                         layer1 = z*2+1;
674                         if (layer1 >= texture->mipmap[i-1][4])
675                                 layer1 = texture->mipmap[i-1][4]-1;
676                         for (y = 0;y < texture->mipmap[i][3];y++)
677                         {
678                                 row0 = y*2;
679                                 row1 = y*2+1;
680                                 if (row1 >= texture->mipmap[i-1][3])
681                                         row1 = texture->mipmap[i-1][3]-1;
682                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
683                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
684                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
685                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
686                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
687                                 w = texture->mipmap[i][2];
688                                 if (layer1 > layer0)
689                                 {
690                                         if (texture->mipmap[i-1][2] > 1)
691                                         {
692                                                 // average 3D texture
693                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
694                                                 {
695                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
696                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
697                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
698                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
699                                                 }
700                                         }
701                                         else
702                                         {
703                                                 // average 3D mipmap with parent width == 1
704                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
705                                                 {
706                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
707                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
708                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
709                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
710                                                 }
711                                         }
712                                 }
713                                 else
714                                 {
715                                         if (texture->mipmap[i-1][2] > 1)
716                                         {
717                                                 // average 2D texture (common case)
718                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
719                                                 {
720                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
721                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
722                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
723                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
724                                                 }
725                                         }
726                                         else
727                                         {
728                                                 // 2D texture with parent width == 1
729                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
730                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
731                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
732                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
733                                         }
734                                 }
735                         }
736                 }
737         }
738 }
739 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
740 {
741         DPSOFTRAST_Texture *texture;
742         unsigned char *dst;
743         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
744         if (texture->binds)
745                 DPSOFTRAST_Flush();
746         if (pixels)
747         {
748                 dst = texture->bytes + texture->mipmap[0][1] +(-blocky * texture->mipmap[0][2] + blockx) * 4;
749                 while (blockheight > 0)
750                 {
751                         dst -= texture->mipmap[0][2] * 4;
752                         memcpy(dst, pixels, blockwidth * 4);
753                         pixels += blockwidth * 4;
754                         blockheight--;
755                 }
756         }
757         DPSOFTRAST_Texture_CalculateMipmaps(index);
758 }
759 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
760 {
761         DPSOFTRAST_Texture *texture;
762         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
763         if (texture->binds)
764                 DPSOFTRAST_Flush();
765         if (pixels)
766         {
767                 int i, stride = texture->mipmap[0][2]*4;
768                 unsigned char *dst = texture->bytes + texture->mipmap[0][1];
769                 for (i = texture->mipmap[0][3];i > 0;i--)
770                 {
771                         dst -= stride;
772                         memcpy(dst, pixels, stride);
773                         pixels += stride;
774                 }
775         }
776         DPSOFTRAST_Texture_CalculateMipmaps(index);
777 }
778 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
779 {
780         DPSOFTRAST_Texture *texture;
781         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
782         return texture->mipmap[mip][2];
783 }
784 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
785 {
786         DPSOFTRAST_Texture *texture;
787         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
788         return texture->mipmap[mip][3];
789 }
790 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
791 {
792         DPSOFTRAST_Texture *texture;
793         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
794         return texture->mipmap[mip][4];
795 }
796 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
797 {
798         DPSOFTRAST_Texture *texture;
799         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
800         if (texture->binds)
801                 DPSOFTRAST_Flush();
802         return texture->bytes + texture->mipmap[mip][0];
803 }
804 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
805 {
806         DPSOFTRAST_Texture *texture;
807         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
808         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
809         {
810                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
811                 return;
812         }
813         if (texture->binds)
814                 DPSOFTRAST_Flush();
815         texture->filter = filter;
816 }
817
818 static void DPSOFTRAST_Draw_FlushThreads(void);
819
820 static void DPSOFTRAST_Draw_SyncCommands(void)
821 {
822         if(dpsoftrast.usethreads) MEMORY_BARRIER;
823         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
824 }
825
826 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
827 {
828         DPSOFTRAST_State_Thread *thread;
829         int i;
830         int freecommand = dpsoftrast.commandpool.freecommand;
831         int usedcommands = dpsoftrast.commandpool.usedcommands;
832         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
833                 return;
834         DPSOFTRAST_Draw_SyncCommands();
835         for(;;)
836         {
837                 int waitindex = -1;
838                 int commandoffset;
839                 usedcommands = 0;
840                 for (i = 0; i < dpsoftrast.numthreads; i++)
841                 {
842                         thread = &dpsoftrast.threads[i]; 
843                         commandoffset = freecommand - thread->commandoffset;
844                         if (commandoffset < 0)
845                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
846                         if (commandoffset > usedcommands)
847                         {
848                                 waitindex = i;
849                                 usedcommands = commandoffset;
850                         }
851                 }
852                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
853                         break;
854                 thread = &dpsoftrast.threads[waitindex];
855                 Thread_LockMutex(thread->drawmutex);
856                 if (thread->commandoffset != dpsoftrast.drawcommand)
857                 {
858                         thread->waiting = true;
859                         if (thread->starving) Thread_CondSignal(thread->drawcond);
860                         Thread_CondWait(thread->waitcond, thread->drawmutex);
861                         thread->waiting = false;
862                 }
863                 Thread_UnlockMutex(thread->drawmutex);
864         }
865         dpsoftrast.commandpool.usedcommands = usedcommands;
866 }
867
// rounds size up to the next multiple of COMMAND_SIZE
// (the masking only works if COMMAND_SIZE is a power of two)
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) & ~(COMMAND_SIZE-1))
// allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name>, using the aligned size of the command structure
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
872
873 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
874 {
875         DPSOFTRAST_Command *command;
876         int freecommand = dpsoftrast.commandpool.freecommand;
877         int usedcommands = dpsoftrast.commandpool.usedcommands;
878         int extra = sizeof(DPSOFTRAST_Command);
879         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
880                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
881         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
882         {
883                 if (dpsoftrast.usethreads)
884                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
885                 else
886                         DPSOFTRAST_Draw_FlushThreads();
887                 freecommand = dpsoftrast.commandpool.freecommand;
888                 usedcommands = dpsoftrast.commandpool.usedcommands;
889         }
890         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
891         {
892                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
893                 command->opcode = DPSOFTRAST_OPCODE_Reset;
894                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
895                 freecommand = 0;
896         }
897         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
898         command->opcode = opcode;
899         command->commandsize = size;
900         freecommand += size;
901         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
902                 freecommand = 0;
903         dpsoftrast.commandpool.freecommand = freecommand;
904         dpsoftrast.commandpool.usedcommands = usedcommands + size;
905         return command;
906 }
907
908 static void DPSOFTRAST_UndoCommand(int size)
909 {
910         int freecommand = dpsoftrast.commandpool.freecommand;
911         int usedcommands = dpsoftrast.commandpool.usedcommands;
912         freecommand -= size;
913         if (freecommand < 0)
914                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
915         usedcommands -= size;
916         dpsoftrast.commandpool.freecommand = freecommand;
917         dpsoftrast.commandpool.usedcommands = usedcommands;
918 }
919                 
920 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
921 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
922 {
923         thread->viewport[0] = command->x;
924         thread->viewport[1] = command->y;
925         thread->viewport[2] = command->width;
926         thread->viewport[3] = command->height;
927         thread->validate |= DPSOFTRAST_VALIDATE_FB;
928 }
929 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
930 {
931         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
932         command->x = x;
933         command->y = y;
934         command->width = width;
935         command->height = height;
936
937         dpsoftrast.viewport[0] = x;
938         dpsoftrast.viewport[1] = y;
939         dpsoftrast.viewport[2] = width;
940         dpsoftrast.viewport[3] = height;
941         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
942 }
943
944 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
945 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
946 {
947         int i, x1, y1, x2, y2, w, h, x, y;
948         int miny1, maxy1, miny2, maxy2;
949         int bandy;
950         unsigned int *p;
951         unsigned int c;
952         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
953         miny1 = thread->miny1;
954         maxy1 = thread->maxy1;
955         miny2 = thread->miny2;
956         maxy2 = thread->maxy2;
957         x1 = thread->fb_scissor[0];
958         y1 = thread->fb_scissor[1];
959         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
960         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
961         if (y1 < miny1) y1 = miny1;
962         if (y2 > maxy2) y2 = maxy2;
963         w = x2 - x1;
964         h = y2 - y1;
965         if (w < 1 || h < 1)
966                 return;
967         // FIXME: honor fb_colormask?
968         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
969         for (i = 0;i < 4;i++)
970         {
971                 if (!dpsoftrast.fb_colorpixels[i])
972                         continue;
973                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
974                 for (;y < bandy;y++)
975                 {
976                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
977                         for (x = x1;x < x2;x++)
978                                 p[x] = c;
979                 }
980         }
981 }
982 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
983 {
984         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
985         command->r = r;
986         command->g = g;
987         command->b = b;
988         command->a = a;
989 }
990
991 DEFCOMMAND(3, ClearDepth, float depth;)
992 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
993 {
994         int x1, y1, x2, y2, w, h, x, y;
995         int miny1, maxy1, miny2, maxy2;
996         int bandy;
997         unsigned int *p;
998         unsigned int c;
999         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
1000         miny1 = thread->miny1;
1001         maxy1 = thread->maxy1;
1002         miny2 = thread->miny2;
1003         maxy2 = thread->maxy2;
1004         x1 = thread->fb_scissor[0];
1005         y1 = thread->fb_scissor[1];
1006         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1007         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
1008         if (y1 < miny1) y1 = miny1;
1009         if (y2 > maxy2) y2 = maxy2;
1010         w = x2 - x1;
1011         h = y2 - y1;
1012         if (w < 1 || h < 1)
1013                 return;
1014         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
1015         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1016         for (;y < bandy;y++)
1017         {
1018                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1019                 for (x = x1;x < x2;x++)
1020                         p[x] = c;
1021         }
1022 }
1023 void DPSOFTRAST_ClearDepth(float d)
1024 {
1025         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1026         command->depth = d;
1027 }
1028
1029 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1030 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1031 {
1032         thread->colormask[0] = command->r != 0;
1033         thread->colormask[1] = command->g != 0;
1034         thread->colormask[2] = command->b != 0;
1035         thread->colormask[3] = command->a != 0;
1036         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1037 }
1038 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1039 {
1040         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1041         command->r = r;
1042         command->g = g;
1043         command->b = b;
1044         command->a = a;
1045 }
1046
1047 DEFCOMMAND(5, DepthTest, int enable;)
1048 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1049 {
1050         thread->depthtest = command->enable;
1051         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1052 }
1053 void DPSOFTRAST_DepthTest(int enable)
1054 {
1055         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1056         command->enable = enable;
1057 }
1058
1059 DEFCOMMAND(6, ScissorTest, int enable;)
1060 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1061 {
1062         thread->scissortest = command->enable;
1063         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1064 }
1065 void DPSOFTRAST_ScissorTest(int enable)
1066 {
1067         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1068         command->enable = enable;
1069 }
1070
1071 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1072 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1073 {
1074         thread->scissor[0] = command->x;
1075         thread->scissor[1] = command->y;
1076         thread->scissor[2] = command->width;
1077         thread->scissor[3] = command->height;
1078         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1079 }
1080 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1081 {
1082         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1083         command->x = x;
1084         command->y = y;
1085         command->width = width;
1086         command->height = height;
1087 }
1088
1089 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1090 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1091 {
1092         thread->blendfunc[0] = command->sfactor;
1093         thread->blendfunc[1] = command->dfactor;
1094         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1095 }
1096 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1097 {
1098         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1099         command->sfactor = sfactor;
1100         command->dfactor = dfactor;
1101 }
1102
1103 DEFCOMMAND(9, BlendSubtract, int enable;)
1104 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1105 {
1106         thread->blendsubtract = command->enable;
1107         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1108 }
1109 void DPSOFTRAST_BlendSubtract(int enable)
1110 {
1111         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1112         command->enable = enable;
1113 }
1114
1115 DEFCOMMAND(10, DepthMask, int enable;)
1116 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1117 {
1118         thread->depthmask = command->enable;
1119 }
1120 void DPSOFTRAST_DepthMask(int enable)
1121 {
1122         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1123         command->enable = enable;
1124 }
1125
1126 DEFCOMMAND(11, DepthFunc, int func;)
1127 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1128 {
1129         thread->depthfunc = command->func;
1130 }
1131 void DPSOFTRAST_DepthFunc(int func)
1132 {
1133         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1134         command->func = func;
1135 }
1136
1137 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1138 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1139 {
1140         thread->depthrange[0] = command->nearval;
1141         thread->depthrange[1] = command->farval;
1142 }
1143 void DPSOFTRAST_DepthRange(float nearval, float farval)
1144 {
1145         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1146         command->nearval = nearval;
1147         command->farval = farval;
1148 }
1149
1150 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1151 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1152 {
1153         thread->polygonoffset[0] = command->alongnormal;
1154         thread->polygonoffset[1] = command->intoview;
1155 }
1156 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1157 {
1158         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1159         command->alongnormal = alongnormal;
1160         command->intoview = intoview;
1161 }
1162
1163 DEFCOMMAND(14, CullFace, int mode;)
1164 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1165 {
1166         thread->cullface = command->mode;
1167 }
1168 void DPSOFTRAST_CullFace(int mode)
1169 {
1170         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1171         command->mode = mode;
1172 }
1173
1174 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1175 {
1176         dpsoftrast.color[0] = r;
1177         dpsoftrast.color[1] = g;
1178         dpsoftrast.color[2] = b;
1179         dpsoftrast.color[3] = a;
1180 }
1181
1182 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1183 {
1184         int outstride = blockwidth * 4;
1185         int instride = dpsoftrast.fb_width * 4;
1186         int bx1 = blockx;
1187         int by1 = blocky;
1188         int bx2 = blockx + blockwidth;
1189         int by2 = blocky + blockheight;
1190         int bw;
1191         int x;
1192         int y;
1193         unsigned char *inpixels;
1194         unsigned char *b;
1195         unsigned char *o;
1196         DPSOFTRAST_Flush();
1197         if (bx1 < 0) bx1 = 0;
1198         if (by1 < 0) by1 = 0;
1199         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1200         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1201         bw = bx2 - bx1;
1202         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1203         if (dpsoftrast.bigendian)
1204         {
1205                 for (y = by1;y < by2;y++)
1206                 {
1207                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1208                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1209                         for (x = bx1;x < bx2;x++)
1210                         {
1211                                 o[0] = b[3];
1212                                 o[1] = b[2];
1213                                 o[2] = b[1];
1214                                 o[3] = b[0];
1215                                 o += 4;
1216                                 b += 4;
1217                         }
1218                 }
1219         }
1220         else
1221         {
1222                 for (y = by1;y < by2;y++)
1223                 {
1224                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1225                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1226                         memcpy(o, b, bw*4);
1227                 }
1228         }
1229
1230 }
1231 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1232 {
1233         int tx1 = tx;
1234         int ty1 = ty;
1235         int tx2 = tx + width;
1236         int ty2 = ty + height;
1237         int sx1 = sx;
1238         int sy1 = sy;
1239         int sx2 = sx + width;
1240         int sy2 = sy + height;
1241         int swidth;
1242         int sheight;
1243         int twidth;
1244         int theight;
1245         int sw;
1246         int sh;
1247         int tw;
1248         int th;
1249         int y;
1250         unsigned int *spixels;
1251         unsigned int *tpixels;
1252         DPSOFTRAST_Texture *texture;
1253         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1254         if (mip < 0 || mip >= texture->mipmaps) return;
1255         DPSOFTRAST_Flush();
1256         spixels = dpsoftrast.fb_colorpixels[0];
1257         swidth = dpsoftrast.fb_width;
1258         sheight = dpsoftrast.fb_height;
1259         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1260         twidth = texture->mipmap[mip][2];
1261         theight = texture->mipmap[mip][3];
1262         if (tx1 < 0) tx1 = 0;
1263         if (ty1 < 0) ty1 = 0;
1264         if (tx2 > twidth) tx2 = twidth;
1265         if (ty2 > theight) ty2 = theight;
1266         if (sx1 < 0) sx1 = 0;
1267         if (sy1 < 0) sy1 = 0;
1268         if (sx2 > swidth) sx2 = swidth;
1269         if (sy2 > sheight) sy2 = sheight;
1270         tw = tx2 - tx1;
1271         th = ty2 - ty1;
1272         sw = sx2 - sx1;
1273         sh = sy2 - sy1;
1274         if (tw > sw) tw = sw;
1275         if (th > sh) th = sh;
1276         if (tw < 1 || th < 1)
1277                 return;
1278         sy1 = sheight - sy1 - th;
1279         ty1 = theight - ty1 - th;
1280         for (y = 0;y < th;y++)
1281                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1282         if (texture->mipmaps > 1)
1283                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1284 }
1285
1286 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1287 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1288 {
1289         if (thread->texbound[command->unitnum])
1290                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1291         thread->texbound[command->unitnum] = command->texture;
1292 }
1293 void DPSOFTRAST_SetTexture(int unitnum, int index)
1294 {
1295         DPSOFTRAST_Command_SetTexture *command;
1296         DPSOFTRAST_Texture *texture;
1297         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1298         {
1299                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1300                 return;
1301         }
1302         texture = DPSOFTRAST_Texture_GetByIndex(index);
1303         if (index && !texture)
1304         {
1305                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1306                 return;
1307         }
1308
1309         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1310         command->unitnum = unitnum;
1311         command->texture = texture;
1312
1313         dpsoftrast.texbound[unitnum] = texture;
1314         if (texture)
1315                 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1316 }
1317
1318 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1319 {
1320         dpsoftrast.pointer_vertex3f = vertex3f;
1321         dpsoftrast.stride_vertex = stride;
1322 }
1323 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1324 {
1325         dpsoftrast.pointer_color4f = color4f;
1326         dpsoftrast.pointer_color4ub = NULL;
1327         dpsoftrast.stride_color = stride;
1328 }
1329 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1330 {
1331         dpsoftrast.pointer_color4f = NULL;
1332         dpsoftrast.pointer_color4ub = color4ub;
1333         dpsoftrast.stride_color = stride;
1334 }
1335 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1336 {
1337         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1338         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1339         dpsoftrast.stride_texcoord[unitnum] = stride;
1340 }
1341
1342 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1343 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1344 {
1345         thread->shader_mode = command->mode;
1346         thread->shader_permutation = command->permutation;
1347         thread->shader_exactspecularmath = command->exactspecularmath;
1348 }
1349 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1350 {
1351         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1352         command->mode = mode;
1353         command->permutation = permutation;
1354         command->exactspecularmath = exactspecularmath;
1355
1356         dpsoftrast.shader_mode = mode;
1357         dpsoftrast.shader_permutation = permutation;
1358         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1359 }
1360
1361 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1362 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1363 {
1364         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1365 }
1366 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1367 {
1368         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1369         command->index = index;
1370         command->val[0] = v0;
1371         command->val[1] = v1;
1372         command->val[2] = v2;
1373         command->val[3] = v3;
1374
1375         dpsoftrast.uniform4f[index*4+0] = v0;
1376         dpsoftrast.uniform4f[index*4+1] = v1;
1377         dpsoftrast.uniform4f[index*4+2] = v2;
1378         dpsoftrast.uniform4f[index*4+3] = v3;
1379 }
1380 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1381 {
1382         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1383         command->index = index;
1384         memcpy(command->val, v, sizeof(command->val));
1385
1386         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1387 }
1388
1389 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1390 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1391 {
1392         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1393 }
1394 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1395 {
1396 #ifdef SSE_POSSIBLE
1397         int i, index;
1398         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1399         {
1400                 __m128 m0, m1, m2, m3;
1401                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1402                 command->index = (DPSOFTRAST_UNIFORM)index;
1403                 if (((size_t)v)&(ALIGN_SIZE-1))
1404                 {
1405                         m0 = _mm_loadu_ps(v);
1406                         m1 = _mm_loadu_ps(v+4);
1407                         m2 = _mm_loadu_ps(v+8);
1408                         m3 = _mm_loadu_ps(v+12);
1409                 }
1410                 else
1411                 {
1412                         m0 = _mm_load_ps(v);
1413                         m1 = _mm_load_ps(v+4);
1414                         m2 = _mm_load_ps(v+8);
1415                         m3 = _mm_load_ps(v+12);
1416                 }
1417                 if (transpose)
1418                 {
1419                         __m128 t0, t1, t2, t3;
1420                         t0 = _mm_unpacklo_ps(m0, m1);
1421                         t1 = _mm_unpacklo_ps(m2, m3);
1422                         t2 = _mm_unpackhi_ps(m0, m1);
1423                         t3 = _mm_unpackhi_ps(m2, m3);
1424                         m0 = _mm_movelh_ps(t0, t1);
1425                         m1 = _mm_movehl_ps(t1, t0);
1426                         m2 = _mm_movelh_ps(t2, t3);
1427                         m3 = _mm_movehl_ps(t3, t2);                     
1428                 }
1429                 _mm_store_ps(command->val, m0);
1430                 _mm_store_ps(command->val+4, m1);
1431                 _mm_store_ps(command->val+8, m2);
1432                 _mm_store_ps(command->val+12, m3);
1433                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1434                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1435                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1436                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1437         }
1438 #endif
1439 }
1440
// Uniform1i command: carries the uniform slot (index) and the integer value
// to store in it. DEFCOMMAND is defined outside this section; its first
// argument (21) is presumably the command's opcode - confirm at the macro.
1441 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Executes a queued Uniform1i command: copies the command's value into the
// uniform1i table of the given thread state.
1442 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1443 {
1444         thread->uniform1i[command->index] = command->val;
1445 }
// Sets integer uniform `index` to `i0`.
// A Uniform1i command is allocated and filled in for later interpretation,
// and the same value is also written straight into the global
// dpsoftrast.uniform1i table.
1446 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1447 {
1448         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1449         command->index = index;
1450         command->val = i0;
1451
             // mirror the value in the global state as well
1452         dpsoftrast.uniform1i[command->index] = i0;
1453 }
1454
// ClipPlane command: carries the four plane coefficients. DEFCOMMAND is
// defined outside this section; its first argument (24) is presumably the
// command's opcode - confirm at the macro.
1455 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
// Executes a queued ClipPlane command: copies the four coefficients into the
// thread state and sets the DPSOFTRAST_VALIDATE_FB bit in thread->validate.
1456 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1457 {
1458         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1459         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1460 }
// Queues a ClipPlane command holding the coefficients (x, y, z, w).
// Unlike DPSOFTRAST_Uniform1i, this function only fills in the command; it
// does not write anything into the global dpsoftrast state.
1461 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1462 {
1463         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1464         command->clipplane[0] = x;
1465         command->clipplane[1] = y;
1466         command->clipplane[2] = z;
1467         command->clipplane[3] = w;
1468 }
1469
1470 #ifdef SSE_POSSIBLE
// Copies `size` elements of four floats each from a strided source array
// into a tightly packed destination.
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source element, addressed as bytes
//   size   - number of elements to copy
//   stride - distance in bytes between consecutive source elements
// Aligned loads are used only when both the source address and the stride
// are multiples of ALIGN_SIZE; otherwise unaligned loads are used.
1471 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1472 {
1473         float *end = dst + size*4;
1474         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1475         {
1476                 while (dst < end)
1477                 {
1478                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1479                         dst += 4;
1480                         src += stride;
1481                 }
1482         }
1483         else
1484         {
1485                 while (dst < end)
1486                 {
1487                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1488                         dst += 4;
1489                         src += stride;
1490                 }
1491         }
1492 }
1493
// Expands `size` three-float elements from a strided source array into
// packed four-float elements (x, y, z, 1.0f).
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source element, addressed as bytes
//   size   - number of elements to convert
//   stride - distance in bytes between consecutive source elements
// When the source is tightly packed (stride == 12 bytes) elements are first
// converted four at a time: three 16-byte loads cover exactly four elements.
// Whatever is left (or everything, for other strides) is converted one
// element at a time.
// NOTE(review): the one-at-a-time paths load 16 bytes from each 12-byte
// element and discard the fourth float, so 4 bytes past the element are read;
// for the final element this assumes the bytes after the array are readable.
1494 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1495 {
1496         float *end = dst + size*4;
1497         if (stride == sizeof(float[3]))
1498         {
                     // end of the part that can be handled in groups of four elements
1499                 float *end4 = dst + (size&~3)*4;        
1500                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1501                 {
1502                         while (dst < end4)
1503                         {
                                     // v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
1504                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
                                     // each element is assembled as (1, x, y, z) and then rotated to (x, y, z, 1)
1505                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1506                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1507                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1508                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1509                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1510                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1511                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1512                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1513                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1514                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1515                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1516                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1517                                 dst += 16;
1518                                 src += 4*sizeof(float[3]);
1519                         }
1520                 }
1521                 else
1522                 {
                             // same conversion as above, using aligned loads
1523                         while (dst < end4)
1524                         {
1525                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1526                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1527                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1528                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1529                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1530                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1531                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1532                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1533                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1534                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1535                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1536                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1537                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1538                                 dst += 16;
1539                                 src += 4*sizeof(float[3]);
1540                         }
1541                 }
1542         }
             // one element at a time: load four floats, replace the fourth with 1.0f
1543         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1544         {
1545                 while (dst < end)
1546                 {
1547                         __m128 v = _mm_loadu_ps((const float *)src);
1548                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1549                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1550                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1551                         _mm_store_ps(dst, v);
1552                         dst += 4;
1553                         src += stride;
1554                 }
1555         }
1556         else
1557         {
1558                 while (dst < end)
1559                 {
1560                         __m128 v = _mm_load_ps((const float *)src);
1561                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1562                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1563                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1564                         _mm_store_ps(dst, v);
1565                         dst += 4;
1566                         src += stride;
1567                 }
1568         }
1569 }
1570
// Expands `size` two-float elements from a strided source array into packed
// four-float elements (x, y, 0.0f, 1.0f).
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source element, addressed as bytes
//   size   - number of elements to convert
//   stride - distance in bytes between consecutive source elements
// When the source is tightly packed (stride == 8 bytes) elements are first
// converted in pairs from a single 16-byte load; the remainder (or
// everything, for other strides) is converted one element at a time.
1571 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1572 {
1573         float *end = dst + size*4;
             // v2 supplies the constant upper half (0, 1) of every output element
1574         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1575         if (stride == sizeof(float[2]))
1576         {
                     // end of the part that can be handled in pairs
1577                 float *end2 = dst + (size&~1)*4;
1578                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1579                 {
1580                         while (dst < end2)
1581                         {
                                     // v = x0 y0 x1 y1
1582                                 __m128 v = _mm_loadu_ps((const float *)src);
1583                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1584                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1585                                 dst += 8;
1586                                 src += 2*sizeof(float[2]);
1587                         }
1588                 }
1589                 else
1590                 {
1591                         while (dst < end2)
1592                         {
1593                                 __m128 v = _mm_load_ps((const float *)src);
1594                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1595                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1596                                 dst += 8;
1597                                 src += 2*sizeof(float[2]);
1598                         }
1599                 }
1600         }
             // one element at a time: load 8 bytes into the low half, keep (0, 1) in the high half
1601         while (dst < end)
1602         {
1603                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1604                 dst += 4;
1605                 src += stride;
1606         }
1607 }
1608
// Converts `size` elements of four unsigned bytes each into packed
// four-float elements, scaling every byte by 1/255.
//   dst    - destination, written with aligned 16-byte stores
//   src    - first source element, addressed as bytes
//   size   - number of elements to convert
//   stride - distance in bytes between consecutive source elements
// When the source is tightly packed (stride == 4 bytes) elements are first
// converted four at a time from a single 16-byte load; the remainder (or
// everything, for other strides) is converted one element at a time.
1609 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1610 {
1611         float *end = dst + size*4;
1612         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1613         if (stride == sizeof(unsigned char[4]))
1614         {
                     // end of the part that can be handled in groups of four elements
1615                 float *end4 = dst + (size&~3)*4;
1616                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1617                 {
1618                         while (dst < end4)
1619                         {
                                     // widen bytes to 16 bits (v1 = low 8 bytes, v2 = high 8 bytes), then to 32 bits below
1620                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1621                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1622                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1623                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1624                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1625                                 dst += 16;
1626                                 src += 4*sizeof(unsigned char[4]);
1627                         }
1628                 }
1629                 else
1630                 {
                             // same conversion as above, using an aligned load
1631                         while (dst < end4)
1632                         {
1633                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1634                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1635                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1636                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1637                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1638                                 dst += 16;
1639                                 src += 4*sizeof(unsigned char[4]);
1640                         }
1641                 }
1642         }
             // one element at a time
1643         while (dst < end)
1644         {
                     // NOTE(review): the four bytes are read through an int pointer without any alignment check
1645                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1646                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1647                 dst += 4;
1648                 src += stride;
1649         }
1650 }
1651
// Fills `size` consecutive four-float elements of dst with the four floats
// found at src.
//   dst  - destination, written with aligned 16-byte stores
//   src  - the four floats to replicate (read once, with an unaligned load)
//   size - number of elements to write
1652 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1653 {
1654         float *end = dst + 4*size;
1655         __m128 v = _mm_loadu_ps(src);
1656         while (dst < end)
1657         {
1658                 _mm_store_ps(dst, v);
1659                 dst += 4;
1660         }
1661 }
1662 #endif
1663
// Transforms `numitems` four-float vectors by a 4x4 matrix.
//   out4f       - destination, written with aligned 16-byte stores; may be
//                 the same pointer as in4f
//   in4f        - source vectors (aligned or unaligned)
//   numitems    - number of vectors
//   inmatrix16f - 16 floats; for an input (x, y, z, w) the result is
//                 x*m[0..3] + y*m[4..7] + z*m[8..11] + w*m[12..15]
// A matrix that is bit-for-bit equal to the identity is handled by a plain
// copy (or by nothing at all when out4f == in4f).
// The whole body is inside #ifdef SSE_POSSIBLE, so without it this function
// does nothing.
1664 static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
1665 {
1666 #ifdef SSE_POSSIBLE
1667         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1668         __m128 m0, m1, m2, m3;
1669         float *end;
1670         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1671         {
1672                 // fast case for identity matrix
1673                 if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
1674                 return;
1675         }
1676         end = out4f + numitems*4;
             // the matrix is always read with unaligned loads
1677         m0 = _mm_loadu_ps(inmatrix16f);
1678         m1 = _mm_loadu_ps(inmatrix16f + 4);
1679         m2 = _mm_loadu_ps(inmatrix16f + 8);
1680         m3 = _mm_loadu_ps(inmatrix16f + 12);
1681         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1682         {
1683                 while (out4f < end)
1684                 {
1685                         __m128 v = _mm_loadu_ps(in4f);
1686                         _mm_store_ps(out4f,
1687                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1688                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1689                                                         _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1690                                                                                 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1691                         out4f += 4;
1692                         in4f += 4;
1693                 }
1694         }
1695         else
1696         {
                     // same computation as above, using aligned loads for the input
1697                 while (out4f < end)
1698                 {
1699                         __m128 v = _mm_load_ps(in4f);
1700                         _mm_store_ps(out4f,
1701                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
1702                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
1703                                                         _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
1704                                                                                 _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
1705                         out4f += 4;
1706                         in4f += 4;
1707                 }
1708         }
1709 #endif
1710 }
1711
1712 #if 0
// Copies `numitems` four-float vectors from in4f to out4f with memcpy.
// This function sits inside an #if 0 block and is therefore not compiled.
1713 static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1714 {
1715         memcpy(out4f, in4f, numitems * sizeof(float[4]));
1716 }
1717 #endif
1718
1719 #ifdef SSE_POSSIBLE
// Perspective-divides one vertex and applies the viewport mapping.
// For in = (x, y, z, w) the input is rearranged to (1, x, y, z), multiplied
// by viewportscale, divided by w, offset by viewportcenter and rotated back,
// so that
//   out = (c[1] + s[1]*x/w, c[2] + s[2]*y/w, c[3] + s[3]*z/w, c[0] + s[0]/w)
// where c = viewportcenter and s = viewportscale. Element 0 of both viewport
// vectors therefore belongs to the 1/w term and elements 1..3 to x, y, z.
// `out` may be the same variable as `in`.
1720 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1721 { \
1722         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1723         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1724         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1725         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1726 }
1727
// Perspective-divides one vertex and applies the viewport mapping.
// The body is identical to DPSOFTRAST_PROJECTVERTEX: despite the name, all
// four components are computed, not only Y. For in = (x, y, z, w):
//   out = (c[1] + s[1]*x/w, c[2] + s[2]*y/w, c[3] + s[3]*z/w, c[0] + s[0]/w)
// where c = viewportcenter and s = viewportscale.
1728 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1729 { \
1730         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1731         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1732         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1733         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1734 }
1735
// Transforms one four-float vector by a matrix given as four __m128 values:
// for in = (x, y, z, w), out = x*m0 + y*m1 + z*m2 + w*m3.
// `in` is copied into a local first, so `out` may be the same variable.
1736 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1737 { \
1738         __m128 p = (in); \
1739         out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1740                                                   _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1741                                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1742                                                                                         _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
1743 }
1744
// Computes the screen-space Y range covered by a bounding box and reports
// which of its corners fail the z + w >= 0 test.
//   starty, endy - receive the resulting integer Y range
//   minposf      - box minimum (four floats, read with an aligned load)
//   maxposf      - box maximum (four floats, read with an aligned load)
//   inmatrix16f  - 4x4 matrix applied to every box corner
// Returns an 8-bit mask: bit k is set when transformed corner k has
// z + w < 0. In corner index k, bit 0 selects min/max X, bit 1 min/max Y and
// bit 2 min/max Z (the W element is taken from the same side as Z).
1745 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1746 {
1747         int clipmask = 0xFF;
1748         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
             // minproj/maxproj start as an empty range; only element 0 of each is used
1749         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1750         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1751         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
             // swap elements 0 and 1 of each matrix row so that the transformed Y
             // lands in element 0, where the scalar (_ss) operations below work
1752         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1753         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1754         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1755         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
             // BBFRONT(k, pos): transform corner k, compute clipdist = z + w, and if
             // it is >= 0 clear bit k of clipmask and merge y/w into minproj/maxproj
1756         #define BBFRONT(k, pos) \
1757         { \
1758                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1759                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1760                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1761                 { \
1762                         __m128 proj; \
1763                         clipmask &= ~(1<<k); \
1764                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1765                         minproj = _mm_min_ss(minproj, proj); \
1766                         maxproj = _mm_max_ss(maxproj, proj); \
1767                 } \
1768         }
             // the eight box corners, built by mixing elements of minpos and maxpos
1769         BBFRONT(0, minpos); 
1770         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1771         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1772         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1773         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1774         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1775         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1776         BBFRONT(7, maxpos);
             // BBCLIP(k): for a corner that failed the test, walk its three box edges
             // (neighbours k^1, k^2, k^4); for every neighbour that passed, find the
             // point on the edge where clipdist reaches 0 and merge its y/w into
             // minproj/maxproj
1777         #define BBCLIP(k) \
1778         { \
1779                 if (clipmask&(1<<k)) \
1780                 { \
1781                         if (!(clipmask&(1<<(k^1)))) \
1782                         { \
1783                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1784                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1785                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1786                                 minproj = _mm_min_ss(minproj, proj); \
1787                                 maxproj = _mm_max_ss(maxproj, proj); \
1788                         } \
1789                         if (!(clipmask&(1<<(k^2)))) \
1790                         { \
1791                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1792                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1793                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1794                                 minproj = _mm_min_ss(minproj, proj); \
1795                                 maxproj = _mm_max_ss(maxproj, proj); \
1796                         } \
1797                         if (!(clipmask&(1<<(k^4)))) \
1798                         { \
1799                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1800                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1801                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1802                                 minproj = _mm_min_ss(minproj, proj); \
1803                                 maxproj = _mm_max_ss(maxproj, proj); \
1804                         } \
1805                 } \
1806         }
1807         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
             // bring element 2 of the viewport vectors (the one DPSOFTRAST_PROJECTVERTEX
             // pairs with Y) into element 0
1808         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1809         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
             // limit the projected range to [-2, 2] before mapping it to the viewport
1810         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1811         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1812         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1813         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
             // NOTE(review): starty comes from maxproj and endy from minproj, which
             // suggests the Y viewport scale is negative - confirm where
             // dpsoftrast.fb_viewportscale is set
1814         *starty = _mm_cvttss_si32(maxproj);
1815         *endy = _mm_cvttss_si32(minproj)+1;
1816         return clipmask;
1817 }
1818         
// Copies `numitems` four-float vertices to out4f, writes their projected
// screen coordinates to screen4f and, when requested, computes the Y range.
//   out4f        - receives an unmodified copy of each input vertex (aligned stores)
//   screen4f     - receives DPSOFTRAST_PROJECTVERTEX of each vertex (aligned stores)
//   starty, endy - if both are non-NULL, receive the Y range computed by
//                  DPSOFTRAST_Vertex_BoundY from the min/max of the inputs
//   in4f         - source vertices (aligned or unaligned)
//   numitems     - number of vertices
// Returns the mask from DPSOFTRAST_Vertex_BoundY, or 0 when starty or endy
// is NULL.
// NOTE(review): the first vertex is loaded before the loop to seed the
// min/max, so in4f is read even when numitems is 0.
1819 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1820 {
1821         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1822         float *end = out4f + numitems*4;
1823         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1824         __m128 minpos, maxpos;
1825         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1826         {
1827                 minpos = maxpos = _mm_loadu_ps(in4f);
1828                 while (out4f < end)
1829                 {
1830                         __m128 v = _mm_loadu_ps(in4f);
1831                         minpos = _mm_min_ps(minpos, v);
1832                         maxpos = _mm_max_ps(maxpos, v);
1833                         _mm_store_ps(out4f, v);
1834                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1835                         _mm_store_ps(screen4f, v);
1836                         in4f += 4;
1837                         out4f += 4;
1838                         screen4f += 4;
1839                 }
1840         }
1841         else
1842         {
                     // same loop as above, using aligned loads for the input
1843                 minpos = maxpos = _mm_load_ps(in4f);
1844                 while (out4f < end)
1845                 {
1846                         __m128 v = _mm_load_ps(in4f);
1847                         minpos = _mm_min_ps(minpos, v);
1848                         maxpos = _mm_max_ps(maxpos, v);
1849                         _mm_store_ps(out4f, v);
1850                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1851                         _mm_store_ps(screen4f, v);
1852                         in4f += 4;
1853                         out4f += 4;
1854                         screen4f += 4;
1855                 }
1856         }
1857         if (starty && endy) 
1858         {
                     // DPSOFTRAST_Vertex_BoundY reads the box with aligned loads, hence the ALIGN()ed temporaries
1859                 ALIGN(float minposf[4]);
1860                 ALIGN(float maxposf[4]);
1861                 _mm_store_ps(minposf, minpos);
1862                 _mm_store_ps(maxposf, maxpos);
1863                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1864         }
1865         return 0;
1866 }
1867
// Transforms `numitems` four-float vertices by a 4x4 matrix, writes the
// transformed vertices to out4f and their projected screen coordinates to
// screen4f and, when requested, computes the Y range.
//   out4f        - receives the transformed vertices (aligned stores)
//   screen4f     - receives DPSOFTRAST_PROJECTVERTEX of each transformed vertex
//   starty, endy - if both are non-NULL, receive the Y range computed by
//                  DPSOFTRAST_Vertex_BoundY
//   in4f         - source vertices (aligned or unaligned)
//   numitems     - number of vertices
//   inmatrix16f  - 16 floats, applied as in DPSOFTRAST_TRANSFORMVERTEX
// A matrix bit-for-bit equal to the identity is forwarded to
// DPSOFTRAST_Vertex_Project. Returns the mask from
// DPSOFTRAST_Vertex_BoundY, or 0 when starty or endy is NULL.
// NOTE(review): the first vertex is loaded before the loop to seed the
// min/max, so in4f is read even when numitems is 0.
1868 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1869 {
1870         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1871         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1872         float *end;
1873         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1874                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1875         end = out4f + numitems*4;
1876         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1877         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1878         m0 = _mm_loadu_ps(inmatrix16f);
1879         m1 = _mm_loadu_ps(inmatrix16f + 4);
1880         m2 = _mm_loadu_ps(inmatrix16f + 8);
1881         m3 = _mm_loadu_ps(inmatrix16f + 12);
1882         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1883         {
1884                 minpos = maxpos = _mm_loadu_ps(in4f);
1885                 while (out4f < end)
1886                 {
1887                         __m128 v = _mm_loadu_ps(in4f);
                             // the min/max are taken over the untransformed inputs; the
                             // matrix is applied to the box corners later, in DPSOFTRAST_Vertex_BoundY
1888                         minpos = _mm_min_ps(minpos, v);
1889                         maxpos = _mm_max_ps(maxpos, v);
1890                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1891                         _mm_store_ps(out4f, v);
1892                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1893                         _mm_store_ps(screen4f, v);
1894                         in4f += 4;
1895                         out4f += 4;
1896                         screen4f += 4;
1897                 }
1898         }
1899         else
1900         {
                     // same loop as above, using aligned loads for the input
1901                 minpos = maxpos = _mm_load_ps(in4f);
1902                 while (out4f < end)
1903                 {
1904                         __m128 v = _mm_load_ps(in4f);
1905                         minpos = _mm_min_ps(minpos, v);
1906                         maxpos = _mm_max_ps(maxpos, v);
1907                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1908                         _mm_store_ps(out4f, v);
1909                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1910                         _mm_store_ps(screen4f, v);
1911                         in4f += 4;
1912                         out4f += 4;
1913                         screen4f += 4;
1914                 }
1915         }
1916         if (starty && endy) 
1917         {
                     // DPSOFTRAST_Vertex_BoundY reads the box with aligned loads, hence the ALIGN()ed temporaries
1918                 ALIGN(float minposf[4]);
1919                 ALIGN(float maxposf[4]);
1920                 _mm_store_ps(minposf, minpos);
1921                 _mm_store_ps(maxposf, maxpos);
1922                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1923         }
1924         return 0;
1925 }
1926 #endif
1927
// Converts one of the currently bound vertex arrays into packed four-float
// elements in dpsoftrast.post_array4f[outarray], covering
// dpsoftrast.numvertices elements starting at dpsoftrast.firstvertex.
//   outarray - index of the destination slot in dpsoftrast.post_array4f
//   inarray  - DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_COLOR, or any other
//              value, which is treated as a texcoord array with index
//              (inarray - DPSOFTRAST_ARRAY_TEXCOORD0)
// Returns the destination pointer, or NULL when SSE_POSSIBLE is not defined.
// For a texcoord array with a NULL pointer, or with a component count other
// than 2, 3 or 4, the destination is returned without being written.
1928 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1929 {
1930 #ifdef SSE_POSSIBLE
1931         float *outf = dpsoftrast.post_array4f[outarray];
1932         const unsigned char *inb;
1933         int firstvertex = dpsoftrast.firstvertex;
1934         int numvertices = dpsoftrast.numvertices;
1935         int stride;
1936         switch(inarray)
1937         {
1938         case DPSOFTRAST_ARRAY_POSITION:
                     // positions are three floats, expanded to (x, y, z, 1)
1939                 stride = dpsoftrast.stride_vertex;
1940                 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1941                 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1942                 break;
1943         case DPSOFTRAST_ARRAY_COLOR:
                     // float colors take priority over byte colors; with neither
                     // bound, every element is filled with dpsoftrast.color
1944                 stride = dpsoftrast.stride_color;
1945                 if (dpsoftrast.pointer_color4f)
1946                 {
1947                         inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1948                         DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1949                 }
1950                 else if (dpsoftrast.pointer_color4ub)
1951                 {
                             // stride was already set above; this assignment repeats it
1952                         stride = dpsoftrast.stride_color;
1953                         inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1954                         DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1955                 }
1956                 else
1957                 {
1958                         DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1959                 }
1960                 break;
1961         default:
                     // texcoord arrays: pick the loader matching the component count
1962                 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1963                 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1964                 {
1965                         inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1966                         switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1967                         {
1968                         case 2:
1969                                 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1970                                 break;
1971                         case 3:
1972                                 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1973                                 break;
1974                         case 4:
1975                                 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1976                                 break;
1977                         }
1978                 }
1979                 break;
1980         }
1981         return outf;
1982 #else
1983         return NULL;
1984 #endif
1985 }
1986
// Transforms a vertex array in place by a 4x4 matrix and returns it.
//   outarray    - index of the slot in dpsoftrast.post_array4f to work on
//   inarray     - if >= 0, that array is first loaded into the slot with
//                 DPSOFTRAST_Array_Load; if negative, the slot's current
//                 contents are used as they are
//   inmatrix16f - matrix passed to DPSOFTRAST_Vertex_Transform
// Operates on dpsoftrast.numvertices elements.
1987 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1988 {
1989         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1990         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1991         return data;
1992 }
1993
1994 #if 0
// Projects a vertex array without transforming it: screen coordinates go to
// dpsoftrast.screencoord4f, the Y range to dpsoftrast.drawstarty/drawendy and
// the mask returned by DPSOFTRAST_Vertex_Project to dpsoftrast.drawclipped.
// If inarray >= 0 the array is loaded first, otherwise the current contents
// of dpsoftrast.post_array4f[outarray] are used.
// This function sits inside an #if 0 block and is therefore not compiled.
1995 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1996 {
1997 #ifdef SSE_POSSIBLE
1998         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1999         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
2000         return data;
2001 #else
2002         return NULL;
2003 #endif
2004 }
2005 #endif
2006
// Transforms a vertex array in place by a 4x4 matrix and projects it.
//   outarray    - index of the slot in dpsoftrast.post_array4f to work on
//   inarray     - if >= 0, that array is first loaded into the slot with
//                 DPSOFTRAST_Array_Load; if negative, the slot's current
//                 contents are used as they are
//   inmatrix16f - matrix passed to DPSOFTRAST_Vertex_TransformProject
// Side effects: screen coordinates are written to dpsoftrast.screencoord4f,
// the Y range to dpsoftrast.drawstarty/drawendy, and the returned mask to
// dpsoftrast.drawclipped.
// Returns the transformed array, or NULL when SSE_POSSIBLE is not defined.
2007 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
2008 {
2009 #ifdef SSE_POSSIBLE
2010         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2011         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
2012         return data;
2013 #else
2014         return NULL;
2015 #endif
2016 }
2017
// Fills zf[startx..endx-1] with the reciprocal of the interpolated w value
// for every pixel of a span.
//   thread   - not used by this function
//   triangle - supplies the w plane: w[0] is the slope per x, w[1] the slope
//              per y, w[2] the constant term
//   span     - supplies the span origin (x, y) and the range startx..endx
//   zf       - output, indexed by x relative to the span origin
// The exact reciprocal is computed only every DPSOFTRAST_DRAW_MAXSUBSPAN
// pixels (and at the last pixel); the values in between are linearly
// interpolated.
2018 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2019 {
2020         int x;
2021         int startx = span->startx;
2022         int endx = span->endx;
2023         float wslope = triangle->w[0];
             // w at relative x == 0 of this span
2024         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
             // exact 1/w at the first pixel; later holds the value at each sub-span end
2025         float endz = 1.0f / (w + wslope * startx);
2026         if (triangle->w[0] == 0)
2027         {
2028                 // LordHavoc: fast flat polygons (HUD/menu)
2029                 for (x = startx;x < endx;x++)
2030                         zf[x] = endz;
2031                 return;
2032         }
2033         for (x = startx;x < endx;)
2034         {
2035                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2036                 float z = endz, dz;
                     // the final sub-span ends exactly on the last pixel of the span
2037                 if (nextsub >= endx) nextsub = endsub = endx-1;
2038                 endz = 1.0f / (w + wslope * nextsub);
                     // per-pixel step; 0 when the sub-span is a single pixel (avoids dividing by zero)
2039                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2040                 for (; x <= endsub; x++, z += dz)
2041                         zf[x] = z;
2042         }
2043 }
2044
2045 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
             // writes the shaded pixels of one span into color buffer 0
             // (dpsoftrast.fb_colorpixels[0]), combining them with the existing
             // contents according to thread->fb_blendmode and skipping pixels whose
             // span->pixelmask entry is zero
             // in4ub holds 4 bytes per pixel, indexed by the same x as pixelmask;
             // byte 3 of each pixel is used as alpha
             // side effects on span->pixelmask: entries are cleared for alpha-killed
             // and (in the alpha blend modes) zero-alpha pixels, and two sentinel
             // bytes are written at pixelmask[endx] and pixelmask[endx+1]
             // does nothing if there is no color buffer 0; triangle is not referenced
             // the whole body is compiled only when SSE_POSSIBLE is defined
2046 {
2047 #ifdef SSE_POSSIBLE
2048         int x;
2049         int startx = span->startx;
2050         int endx = span->endx;
2051         int maskx;
2052         int subx;
2053         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2054         unsigned char * RESTRICT pixelmask = span->pixelmask;
2055         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2056         if (!pixeli)
2057                 return;
             // after this, pixeli[x] is the framebuffer pixel for span column x
2058         pixeli += span->y * dpsoftrast.fb_width + span->x;
2059         // handle alphatest now (this affects depth writes too)
2060         if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2061                 for (x = startx;x < endx;x++)
2062                         if (in4ub[x*4+3] < 128)
2063                                 pixelmask[x] = false;
2064         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2065         // helps sprites, text and hud artwork
2066         switch(thread->fb_blendmode)
2067         {
2068         case DPSOFTRAST_BLENDMODE_ALPHA:
2069         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2070         case DPSOFTRAST_BLENDMODE_SUBALPHA:
                     // maskx ends up one past the last run of nonzero-alpha pixels and
                     // becomes the new endx; if no pixel has nonzero alpha it stays at
                     // startx and the drawing loop below does not run
2071                 maskx = startx;
2072                 for (x = startx;x < endx;x++)
2073                 {
2074                         if (in4ub[x*4+3] >= 1)
2075                         {
                                     // first pixel with nonzero alpha: drawing starts here
2076                                 startx = x;
2077                                 for (;;)
2078                                 {
                                             // skip over a run of nonzero-alpha pixels
2079                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2080                                         maskx = x;
2081                                         if (x >= endx) break;
                                             // NOTE(review): x is the first zero-alpha pixel here; this
                                             // increment plus the pre-increment in the loop below resumes
                                             // the scan at x+2, so pixel x keeps its mask bit and the
                                             // alpha of pixel x+1 is never tested - confirm this is intended
2082                                         ++x;
                                             // mask off the following zero-alpha pixels
2083                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2084                                         if (x >= endx) break;
2085                                 }
2086                                 break;
2087                         }
2088                 }
2089                 endx = maskx;
2090                 break;
2091         case DPSOFTRAST_BLENDMODE_OPAQUE:
2092         case DPSOFTRAST_BLENDMODE_ADD:
2093         case DPSOFTRAST_BLENDMODE_INVMOD:
2094         case DPSOFTRAST_BLENDMODE_MUL:
2095         case DPSOFTRAST_BLENDMODE_MUL2:
2096         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2097         case DPSOFTRAST_BLENDMODE_INVADD:
2098                 break;
2099         }
2100         // put some special values at the end of the mask to ensure the loops end
             // (a nonzero byte stops the masked-run scan, the zero byte after it stops
             // the unmasked-run scan; both bytes lie at or beyond endx)
2101         pixelmask[endx] = 1;
2102         pixelmask[endx+1] = 0;
2103         // LordHavoc: use a double loop to identify subspans, this helps the
2104         // optimized copy/blend loops to perform at their best, most triangles
2105         // have only one run of pixels, and do the search using wide reads...
2106         x = startx;
2107         while (x < endx)
2108         {
2109                 // if this pixel is masked off, it's probably not alone...
2110                 if (!pixelmask[x])
2111                 {
2112                         x++;
2113 #if 1
2114                         if (x + 8 < endx)
2115                         {
2116                                 // the 4-item search must be aligned or else it stalls badly
                                     // (checks single bytes until x is a multiple of 4, then
                                     // tests four mask bytes per 32-bit read)
2117                                 if ((x & 3) && !pixelmask[x]) 
2118                                 {
2119                                         if(pixelmask[x]) goto endmasked;
2120                                         x++;
2121                                         if (x & 3)
2122                                         {
2123                                                 if(pixelmask[x]) goto endmasked;
2124                                                 x++;
2125                                                 if (x & 3)
2126                                                 {
2127                                                         if(pixelmask[x]) goto endmasked;
2128                                                         x++;
2129                                                 }
2130                                         }
2131                                 }
2132                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2133                                         x += 4;
2134                         }
2135 #endif
                             // finish the scan bytewise; the sentinel at pixelmask[endx] stops it
2136                         for (;!pixelmask[x];x++)
2137                                 ;
2138                         // rather than continue the loop, just check the end variable
2139                         if (x >= endx)
2140                                 break;
2141                 }
2142         endmasked:
2143                 // find length of subspan
                     // (x is an unmasked pixel; subx will be one past the end of its run)
2144                 subx = x + 1;
2145 #if 1
2146                 if (subx + 8 < endx)
2147                 {
2148                         if (subx & 3)
2149                         {
2150                                 if(!pixelmask[subx]) goto endunmasked;
2151                                 subx++;
2152                                 if (subx & 3)
2153                                 {
2154                                         if(!pixelmask[subx]) goto endunmasked;
2155                                         subx++;
2156                                         if (subx & 3)
2157                                         {
2158                                                 if(!pixelmask[subx]) goto endunmasked;
2159                                                 subx++;
2160                                         }
2161                                 }
2162                         }
                             // four mask bytes at a time while all four are exactly 1
2163                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2164                                 subx += 4;
2165                 }
2166 #endif
                     // finish bytewise; the zero sentinel at pixelmask[endx+1] stops this
2167                 for (;pixelmask[subx];subx++)
2168                         ;
2169                 // the checks can overshoot, so make sure to clip it...
2170                 if (subx > endx)
2171                         subx = endx;
2172         endunmasked:
2173                 // now that we know the subspan length...  process!
                     // every case below consumes [x, subx) and leaves x == subx
2174                 switch(thread->fb_blendmode)
2175                 {
2176                 case DPSOFTRAST_BLENDMODE_OPAQUE:
                             // plain copy: 16 pixels per step, then 4, then 2, then 1
2177 #if 0
2178                         if (subx - x >= 16)
2179                         {
2180                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2181                                 x = subx;
2182                         }
2183                         else
2184 #elif 1
2185                         while (x + 16 <= subx)
2186                         {
2187                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2188                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2189                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2190                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2191                                 x += 16;
2192                         }
2193 #endif
2194                         {
2195                                 while (x + 4 <= subx)
2196                                 {
2197                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2198                                         x += 4;
2199                                 }
2200                                 if (x + 2 <= subx)
2201                                 {
2202                                         pixeli[x] = ini[x];
2203                                         pixeli[x+1] = ini[x+1];
2204                                         x += 2;
2205                                 }
2206                                 if (x < subx)
2207                                 {
2208                                         pixeli[x] = ini[x];
2209                                         x++;
2210                                 }
2211                         }
2212                         break;
2213                 case DPSOFTRAST_BLENDMODE_ALPHA:
                     // FINISHBLEND(blend2, blend1) blends [x, subx) into pixeli: the bytes
                     // of the incoming (src) and framebuffer (dst) pixels are zero-extended
                     // to 16-bit lanes, blend2 is applied to two pixels at a time and
                     // blend1 to a trailing single pixel, then dst is packed back to
                     // bytes with unsigned saturation and stored
2214                 #define FINISHBLEND(blend2, blend1) \
2215                         for (;x + 1 < subx;x += 2) \
2216                         { \
2217                                 __m128i src, dst; \
2218                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2219                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2220                                 blend2; \
2221                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2222                         } \
2223                         if (x < subx) \
2224                         { \
2225                                 __m128i src, dst; \
2226                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2227                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2228                                 blend1; \
2229                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2230                                 x++; \
2231                         }
                             // dst += ((src - dst) * srcalpha) >> 8
                             // (blend is the source alpha lane broadcast to all four lanes of its pixel)
2232                         FINISHBLEND({
2233                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2234                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2235                         }, {
2236                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2237                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2238                         });
2239                         break;
2240                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
                             // dst += (src * srcalpha) >> 8
2241                         FINISHBLEND({
2242                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2243                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2244                         }, {
2245                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2246                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2247                         });
2248                         break;
2249                 case DPSOFTRAST_BLENDMODE_ADD:
                             // dst += src
2250                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2251                         break;
2252                 case DPSOFTRAST_BLENDMODE_INVMOD:
                             // dst -= (dst * src) >> 8
2253                         FINISHBLEND({
2254                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2255                         }, {
2256                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2257                         });
2258                         break;
2259                 case DPSOFTRAST_BLENDMODE_MUL:
                             // dst = (src * dst) >> 8
2260                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2261                         break;
2262                 case DPSOFTRAST_BLENDMODE_MUL2:
                             // dst = (src * dst) >> 7, i.e. twice the MUL result before saturation
2263                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2264                         break;
2265                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
                             // dst -= (src * srcalpha) >> 8
2266                         FINISHBLEND({
2267                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2268                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2269                         }, {
2270                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2271                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2272                         });
2273                         break;
2274                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
                             // dst = src + dst - ((dst * srcalpha) >> 8)
2275                         FINISHBLEND({
2276                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2277                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2278                         }, {
2279                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2280                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2281                         });
2282                         break;
2283                 case DPSOFTRAST_BLENDMODE_INVADD:
                             // dst += ((255 - dst) * src) >> 8
2284                         FINISHBLEND({
2285                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2286                         }, {
2287                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2288                         });
2289                         break;
2290                 }
2291         }
2292 #endif
2293 }
2294
2295 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2296         // warning: this is SLOW, only use if the optimized per-span functions won't do
             // samples mip level `mip` of texture at (x, y) and writes the four bytes
             // of the result to c, in the same byte order as the texture data
             // x and y are multiplied by the mip width and height, so a step of 1.0
             // covers the whole mip level once
             // if texture->filter has DPSOFTRAST_TEXTURE_FILTER_LINEAR set, the 2x2
             // texels around the sample point are blended bilinearly, otherwise a
             // single texel is copied
             // if texture->flags has DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE set, texel
             // indices are clamped to [0, size-1], otherwise they are masked with
             // size-1 (which wraps correctly only for power-of-two sizes)
2297 {
2298         const unsigned char * RESTRICT pixelbase;
2299         const unsigned char * RESTRICT pixel[4];
2300         int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2301         int wrapmask[2] = { width-1, height-1 };
             // pixelbase sits one row (4*width bytes) before mipmap[mip][0] + mipmap[mip][1];
             // texel rows are addressed by subtracting tci[1]*width from it
             // (presumably [0] is the mip's byte offset and [1] its byte size, which
             // would make pixelbase the last stored row - confirm)
2302         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*width;
2303         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2304         {
                     // texel coordinates with 12 fractional bits, shifted back by half a texel (2048)
2305                 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2306                 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2307                 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
                     // bilinear weights for (col,row), (col+1,row), (col,row+1), (col+1,row+1);
                     // each is a product of two 12 bit factors and the four sum to 1<<24
2308                 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
                     // NOTE(review): tc is unsigned, so tc>>12 is never negative and the
                     // ">= 0" side of the clamp below cannot fire for tci
2309                 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2310                 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2311                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2312                 {
2313                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2314                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2315                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2316                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2317                 }
2318                 else
2319                 {
2320                         tci[0] &= wrapmask[0];
2321                         tci[1] &= wrapmask[1];
2322                         tci1[0] &= wrapmask[0];
2323                         tci1[1] &= wrapmask[1];
2324                 }
                     // the four taps must pair up with lerp[0..3]: pixel[1] and pixel[3] take
                     // the neighbouring column tci1[0], pixel[2] and pixel[3] the neighbouring
                     // row tci1[1] (using tci[0] for all four would disable horizontal filtering)
2325                 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2326                 pixel[1] = pixelbase + 4 * (tci1[0] - tci[1]*width);
2327                 pixel[2] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2328                 pixel[3] = pixelbase + 4 * (tci1[0] - tci1[1]*width);
                     // weighted sum per byte; >>24 removes the 1<<24 weight scale
2329                 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2330                 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2331                 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2332                 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2333         }
2334         else
2335         {
                     // unfiltered: truncate the scaled coordinates to a texel index
2336                 int tci[2] = { x * width, y * height };
2337                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2338                 {
2339                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2340                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2341                 }
2342                 else
2343                 {
2344                         tci[0] &= wrapmask[0];
2345                         tci[1] &= wrapmask[1];
2346                 }
2347                 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2348                 c[0] = pixel[0][0];
2349                 c[1] = pixel[0][1];
2350                 c[2] = pixel[0][2];
2351                 c[3] = pixel[0][3];
2352         }
2353 }
2354
2355 #if 0
2356 static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
             // NOTE: this function is compiled out by the surrounding #if 0
             // samples the texture bound to thread->texbound[texunitindex] for every
             // pixel x in [span->startx, span->endx) and writes four floats per pixel
             // to out4f[x*4 .. x*4+3], scaled to 0..1
             // texture byte 2 goes to output component 0 and byte 0 to component 2;
             // bytes 1 and 3 keep their position
             // texture coordinates come from the attribute selected by arrayindex
             // (via DPSOFTRAST_CALCATTRIB4F) multiplied by zf[x] and the mip size;
             // they are evaluated at sub-span end points and stepped linearly in
             // between using 12 fractional bits
2357 {
2358         int x;
2359         int startx = span->startx;
2360         int endx = span->endx;
2361         int flags;
2362         float c[4];
2363         float data[4];
2364         float slope[4];
2365         float tc[2], endtc[2];
2366         float tcscale[2];
2367         unsigned int tci[2];
2368         unsigned int tci1[2];
2369         unsigned int tcimin[2];
2370         unsigned int tcimax[2];
2371         int tciwrapmask[2];
2372         int tciwidth;
2373         int filter;
2374         int mip;
2375         const unsigned char * RESTRICT pixelbase;
2376         const unsigned char * RESTRICT pixel[4];
2377         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2378         // if no texture is bound, just fill it with white
2379         if (!texture)
2380         {
2381                 for (x = startx;x < endx;x++)
2382                 {
2383                         out4f[x*4+0] = 1.0f;
2384                         out4f[x*4+1] = 1.0f;
2385                         out4f[x*4+2] = 1.0f;
2386                         out4f[x*4+3] = 1.0f;
2387                 }
2388                 return;
2389         }
2390         mip = triangle->mip[texunitindex];
             // pixelbase sits one row (4 * mipmap[mip][2] bytes) before
             // mipmap[mip][0] + mipmap[mip][1]; rows are reached through the negative
             // tciwidth set up below
2391         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2392         // if this mipmap of the texture is 1 pixel, just fill it with that color
             // (the color is read from the first four bytes of texture->bytes)
2393         if (texture->mipmap[mip][1] == 4)
2394         {
2395                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2396                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2397                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2398                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2399                 for (x = startx;x < endx;x++)
2400                 {
2401                         out4f[x*4+0] = c[0];
2402                         out4f[x*4+1] = c[1];
2403                         out4f[x*4+2] = c[2];
2404                         out4f[x*4+3] = c[3];
2405                 }
2406                 return;
2407         }
2408         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2409         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2410         flags = texture->flags;
2411         tcscale[0] = texture->mipmap[mip][2];
2412         tcscale[1] = texture->mipmap[mip][3];
             // negative row stride in texels: row tci[1] lies tci[1] rows before pixelbase
2413         tciwidth = -texture->mipmap[mip][2];
2414         tcimin[0] = 0;
2415         tcimin[1] = 0;
2416         tcimax[0] = texture->mipmap[mip][2]-1;
2417         tcimax[1] = texture->mipmap[mip][3]-1;
2418         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2419         tciwrapmask[1] = texture->mipmap[mip][3]-1;
             // texture coordinate (in texels) at the first pixel of the span
2420         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2421         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2422         if (filter)
2423         {
                     // half texel shift for the linear filter
2424                 endtc[0] -= 0.5f;
2425                 endtc[1] -= 0.5f;
2426         }
2427         for (x = startx;x < endx;)
2428         {
2429                 unsigned int subtc[2];
2430                 unsigned int substep[2];
                     // per-pixel step scale for a full sub-span, including the 1<<12 fixed point factor
2431                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2432                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2433                 if (nextsub >= endx)
2434                 {
                             // last (possibly shorter) sub-span ends on the final pixel
2435                         nextsub = endsub = endx-1;      
2436                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2437                 }
2438                 tc[0] = endtc[0];
2439                 tc[1] = endtc[1];
2440                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2441                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2442                 if (filter)
2443                 {
2444                         endtc[0] -= 0.5f;
2445                         endtc[1] -= 0.5f;
2446                 }
                     // fixed point (12 fractional bits) start value and per-pixel step
2447                 substep[0] = (endtc[0] - tc[0]) * subscale;
2448                 substep[1] = (endtc[1] - tc[1]) * subscale;
2449                 subtc[0] = tc[0] * (1<<12);
2450                 subtc[1] = tc[1] * (1<<12);
2451                 if (filter)
2452                 {
2453                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2454                         {
                                     // linear filter, texel indices clamped to the mip
2455                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2456                                 {
2457                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2458                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2459                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2460                                         tci[0] = subtc[0]>>12;
2461                                         tci[1] = subtc[1]>>12;
2462                                         tci1[0] = tci[0] + 1;
2463                                         tci1[1] = tci[1] + 1;
                                             // tci, tci1 and tcimin are unsigned and tcimin is 0, so the
                                             // lower bound test is always true; only tcimax limits the index
2464                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2465                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2466                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2467                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2468                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2469                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2470                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2471                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
                                             // 0xFF000000 is 255 * (1<<24): removes both the weight scale and the byte range
2472                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2473                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2474                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2475                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2476                                         out4f[x*4+0] = c[0];
2477                                         out4f[x*4+1] = c[1];
2478                                         out4f[x*4+2] = c[2];
2479                                         out4f[x*4+3] = c[3];
2480                                 }
2481                         }
2482                         else
2483                         {
                                     // linear filter, texel indices wrapped with the size-1 mask
2484                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2485                                 {
2486                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2487                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2488                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2489                                         tci[0] = subtc[0]>>12;
2490                                         tci[1] = subtc[1]>>12;
2491                                         tci1[0] = tci[0] + 1;
2492                                         tci1[1] = tci[1] + 1;
2493                                         tci[0] &= tciwrapmask[0];
2494                                         tci[1] &= tciwrapmask[1];
2495                                         tci1[0] &= tciwrapmask[0];
2496                                         tci1[1] &= tciwrapmask[1];
2497                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2498                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2499                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2500                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2501                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2502                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2503                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2504                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2505                                         out4f[x*4+0] = c[0];
2506                                         out4f[x*4+1] = c[1];
2507                                         out4f[x*4+2] = c[2];
2508                                         out4f[x*4+3] = c[3];
2509                                 }
2510                         }
2511                 }
2512                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2513                 {
                             // unfiltered, texel indices clamped to the mip
2514                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2515                         {
2516                                 tci[0] = subtc[0]>>12;
2517                                 tci[1] = subtc[1]>>12;
2518                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2519                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2520                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2521                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2522                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2523                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2524                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2525                                 out4f[x*4+0] = c[0];
2526                                 out4f[x*4+1] = c[1];
2527                                 out4f[x*4+2] = c[2];
2528                                 out4f[x*4+3] = c[3];
2529                         }
2530                 }
2531                 else
2532                 {
                             // unfiltered, texel indices wrapped with the size-1 mask
2533                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2534                         {
2535                                 tci[0] = subtc[0]>>12;
2536                                 tci[1] = subtc[1]>>12;
2537                                 tci[0] &= tciwrapmask[0];
2538                                 tci[1] &= tciwrapmask[1];
2539                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2540                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2541                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2542                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2543                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2544                                 out4f[x*4+0] = c[0];
2545                                 out4f[x*4+1] = c[1];
2546                                 out4f[x*4+2] = c[2];
2547                                 out4f[x*4+3] = c[3];
2548                         }
2549                 }
2550         }
2551 }
2552 #endif
2553
2554 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2555 {
2556 #ifdef SSE_POSSIBLE
2557         int x;
2558         int startx = span->startx;
2559         int endx = span->endx;
2560         int flags;
2561         __m128 data, slope, tcscale;
2562         __m128i tcsize, tcmask, tcoffset, tcmax;
2563         __m128 tc, endtc;
2564         __m128i subtc, substep, endsubtc;
2565         int filter;
2566         int mip;
2567         int affine; // LordHavoc: optimized affine texturing case
2568         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2569         const unsigned char * RESTRICT pixelbase;
2570         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2571         // if no texture is bound, just fill it with white
2572         if (!texture)
2573         {
2574                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2575                 return;
2576         }
2577         mip = triangle->mip[texunitindex];
2578         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2579         // if this mipmap of the texture is 1 pixel, just fill it with that color
2580         if (texture->mipmap[mip][1] == 4)
2581         {
2582                 unsigned int k = *((const unsigned int *)pixelbase);
2583                 for (x = startx;x < endx;x++)
2584                         outi[x] = k;
2585                 return;
2586         }
2587         affine = zf[startx] == zf[endx-1];
2588         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2589         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2590         flags = texture->flags;
2591         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2592         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2593         tcscale = _mm_cvtepi32_ps(tcsize);
2594         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2595         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2596         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2597         if (filter)
2598                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2599         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2600         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_sub_epi32(_mm_setzero_si128(), _mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0))), 18), _mm_set1_epi32(4));
2601         tcmax = _mm_packs_epi32(tcmask, tcmask);
2602         for (x = startx;x < endx;)
2603         {
2604                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2605                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2606                 if (nextsub >= endx || affine)
2607                 {
2608                         nextsub = endsub = endx-1;
2609                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2610                 }       
2611                 tc = endtc;
2612                 subtc = endsubtc;
2613                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2614                 if (filter)
2615                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2616                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2617                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2618                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2619                 substep = _mm_slli_epi32(substep, 1);
2620                 if (filter)
2621                 {
2622                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2623                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2624                         {
2625                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2626                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2627                                 {
2628                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2629                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2630                                         tci = _mm_madd_epi16(tci, tcoffset);
2631                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2632                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2633                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2634                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2635                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2636                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2637                                         fracm = _mm_srli_epi16(subtc, 1);
2638                                         pix1 = _mm_add_epi16(pix1,
2639                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2640                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2641                                         pix3 = _mm_add_epi16(pix3,
2642                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2643                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2644                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2645                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2646                                         pix2 = _mm_add_epi16(pix2,
2647                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2648                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2649                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2650                                 }
2651                                 if (x <= endsub)
2652                                 {
2653                                         const unsigned char * RESTRICT ptr1;
2654                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2655                                         tci = _mm_madd_epi16(tci, tcoffset);
2656                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2657                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2658                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2659                                         fracm = _mm_srli_epi16(subtc, 1);
2660                                         pix1 = _mm_add_epi16(pix1,
2661                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2662                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2663                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2664                                         pix1 = _mm_add_epi16(pix1,
2665                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2666                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2667                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2668                                         x++;
2669                                 }
2670                         }
2671                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2672                         {
2673                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2674                                 {
2675                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2676                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2677                                         tci = _mm_madd_epi16(tci, tcoffset);
2678                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2679                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2680                                                                                         _mm_setzero_si128());
2681                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2682                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2683                                                                                         _mm_setzero_si128());
2684                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2685                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2686                                         tci = _mm_madd_epi16(tci, tcoffset);
2687                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2688                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2689                                                                                         _mm_setzero_si128());
2690                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2691                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2692                                                                                         _mm_setzero_si128());
2693                                         fracm = _mm_srli_epi16(subtc, 1);
2694                                         pix1 = _mm_add_epi16(pix1,
2695                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2696                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2697                                         pix3 = _mm_add_epi16(pix3,
2698                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2699                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2700                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2701                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2702                                         pix2 = _mm_add_epi16(pix2,
2703                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2704                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2705                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2706                                 }
2707                                 if (x <= endsub)
2708                                 {
2709                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2710                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2711                                         tci = _mm_madd_epi16(tci, tcoffset);
2712                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2713                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2714                                                                                         _mm_setzero_si128());
2715                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2716                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2717                                                                                         _mm_setzero_si128());
2718                                         fracm = _mm_srli_epi16(subtc, 1);
2719                                         pix1 = _mm_add_epi16(pix1,
2720                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2721                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2722                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2723                                         pix1 = _mm_add_epi16(pix1,
2724                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2725                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2726                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2727                                         x++;
2728                                 }
2729                         }
2730                         else
2731                         {
2732                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2733                                 {
2734                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2735                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2736                                         tci = _mm_madd_epi16(tci, tcoffset);
2737                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2738                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2739                                                                                         _mm_setzero_si128());
2740                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2741                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2742                                                                                         _mm_setzero_si128());
2743                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2744                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2745                                         tci = _mm_madd_epi16(tci, tcoffset);
2746                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2747                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2748                                                                                         _mm_setzero_si128());
2749                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2750                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2751                                                                                         _mm_setzero_si128());
2752                                         fracm = _mm_srli_epi16(subtc, 1);
2753                                         pix1 = _mm_add_epi16(pix1,
2754                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2755                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2756                                         pix3 = _mm_add_epi16(pix3,
2757                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2758                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2759                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2760                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2761                                         pix2 = _mm_add_epi16(pix2,
2762                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2763                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2764                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2765                                 }
2766                                 if (x <= endsub)
2767                                 {
2768                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2769                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2770                                         tci = _mm_madd_epi16(tci, tcoffset);
2771                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2772                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2773                                                                                         _mm_setzero_si128());
2774                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2775                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2776                                                                                         _mm_setzero_si128());
2777                                         fracm = _mm_srli_epi16(subtc, 1);
2778                                         pix1 = _mm_add_epi16(pix1,
2779                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2780                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2781                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2782                                         pix1 = _mm_add_epi16(pix1,
2783                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2784                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2785                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2786                                         x++;
2787                                 }
2788                         }
2789                 }
2790                 else
2791                 {
2792                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2793                         {
2794                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2795                                 {
2796                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2797                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2798                                         tci = _mm_madd_epi16(tci, tcoffset);
2799                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2800                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2801                                 }
2802                                 if (x <= endsub)
2803                                 {
2804                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2805                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2806                                         tci = _mm_madd_epi16(tci, tcoffset);
2807                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2808                                         x++;
2809                                 }
2810                         }
2811                         else
2812                         {
2813                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2814                                 {
2815                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2816                                         tci = _mm_and_si128(tci, tcmax); 
2817                                         tci = _mm_madd_epi16(tci, tcoffset);
2818                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2819                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2820                                 }
2821                                 if (x <= endsub)
2822                                 {
2823                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2824                                         tci = _mm_and_si128(tci, tcmax); 
2825                                         tci = _mm_madd_epi16(tci, tcoffset);
2826                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2827                                         x++;
2828                                 }
2829                         }
2830                 }
2831         }
2832 #endif
2833 }
2834
2835 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2836 {
2837         // TODO: IMPLEMENT
2838         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2839 }
2840
// Shadowmap lookup stub: the vector argument is not examined and
// the constant 1.0f is returned for every query.
static float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2846
#if 0
// Disabled float path: for each pixel x in [span->startx, span->endx) the four values
// (data[i] + slope[i]*x) * zf[x], with data/slope filled in by DPSOFTRAST_CALCATTRIB4F
// for arrayindex, are multiplied into in4f and stored to out4f (4 floats per pixel).
static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
{
	const int firstx = span->startx;
	const int limitx = span->endx;
	float data[4];
	float slope[4];
	float factor[4];
	int px, ch;
	DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
	for (px = firstx; px < limitx; px++)
	{
		const float w = zf[px];
		const float *src = in4f + px*4;
		float *dst = out4f + px*4;
		// evaluate all four factors before any output is written
		for (ch = 0; ch < 4; ch++)
			factor[ch] = (data[ch] + slope[ch]*px) * w;
		for (ch = 0; ch < 4; ch++)
			dst[ch] = src[ch] * factor[ch];
	}
}
#endif
2872
#if 0
// Disabled float path: stores (data[i] + slope[i]*x) * zf[x] for i = 0..3 into out4f
// (4 floats per pixel) for each pixel x in [span->startx, span->endx); data/slope are
// filled in by DPSOFTRAST_CALCATTRIB4F for arrayindex.
static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
{
	const int firstx = span->startx;
	const int limitx = span->endx;
	float data[4];
	float slope[4];
	float value[4];
	int px, ch;
	DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
	for (px = firstx; px < limitx; px++)
	{
		const float w = zf[px];
		float *dst = out4f + px*4;
		// evaluate all four components before any output is written
		for (ch = 0; ch < 4; ch++)
			value[ch] = (data[ch] + slope[ch]*px) * w;
		for (ch = 0; ch < 4; ch++)
			dst[ch] = value[ch];
	}
}
#endif
2898
#if 0
// Disabled float path: out4f = ina4f + max(inb4f - subcolor, 0) per component,
// for each pixel in [span->startx, span->endx) (4 floats per pixel).
// subcolor is copied to a local array before the loop; triangle is unused.
static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
{
	const int firstx = span->startx;
	const int limitx = span->endx;
	float threshold[4];
	float excess[4];
	int px, ch;
	for (ch = 0; ch < 4; ch++)
		threshold[ch] = subcolor[ch];
	for (px = firstx; px < limitx; px++)
	{
		const float *base = ina4f + px*4;
		const float *bloom = inb4f + px*4;
		float *dst = out4f + px*4;
		// subtract the threshold and drop negative results, for all four components first
		for (ch = 0; ch < 4; ch++)
		{
			excess[ch] = bloom[ch] - threshold[ch];
			if (excess[ch] < 0.0f)
				excess[ch] = 0.0f;
		}
		for (ch = 0; ch < 4; ch++)
			dst[ch] = base[ch] + excess[ch];
	}
}
#endif
2921
#if 0
// Disabled float path: component-wise product of ina4f and inb4f written to out4f
// for each pixel in [span->startx, span->endx) (4 floats per pixel); triangle is unused.
static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
{
	// walk the floats of the span directly instead of pixel by pixel
	const int firstfloat = span->startx*4;
	const int limitfloat = span->endx*4;
	int i;
	for (i = firstfloat; i < limitfloat; i++)
		out4f[i] = ina4f[i] * inb4f[i];
}
#endif
2935
#if 0
// Disabled float path: component-wise sum of ina4f and inb4f written to out4f
// for each pixel in [span->startx, span->endx) (4 floats per pixel); triangle is unused.
static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
{
	// walk the floats of the span directly instead of pixel by pixel
	const int firstfloat = span->startx*4;
	const int limitfloat = span->endx*4;
	int i;
	for (i = firstfloat; i < limitfloat; i++)
		out4f[i] = ina4f[i] + inb4f[i];
}
#endif
2949
#if 0
// Disabled float path: blends inb4f over ina4f using the fourth component of inb4f as the
// weight, i.e. out = ina * (1 - inb[3]) + inb * inb[3] for all four components of each
// pixel in [span->startx, span->endx) (4 floats per pixel); triangle is unused.
static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
{
	const int firstx = span->startx;
	const int limitx = span->endx;
	int px, ch;
	for (px = firstx; px < limitx; px++)
	{
		const float *under = ina4f + px*4;
		const float *over = inb4f + px*4;
		float *dst = out4f + px*4;
		// both weights are read before anything is written for this pixel
		const float keep = 1.0f - over[3];
		const float take = over[3];
		for (ch = 0; ch < 4; ch++)
			dst[ch] = under[ch] * keep + over[ch] * take;
	}
}
#endif
2966
#if 0
// Disabled float path: blends a constant color over in4f using color[3] as the weight,
// i.e. out = in * (1 - color[3]) + color * color[3] for all four components of each
// pixel in [span->startx, span->endx) (4 floats per pixel).
// color is copied to a local array before the loop; triangle is unused.
static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
{
	const int firstx = span->startx;
	const int limitx = span->endx;
	float tint[4];
	float keep, take;
	int px, ch;
	for (ch = 0; ch < 4; ch++)
		tint[ch] = color[ch];
	keep = 1.0f - tint[3];
	take = tint[3];
	for (px = firstx; px < limitx; px++)
	{
		const float *src = in4f + px*4;
		float *dst = out4f + px*4;
		for (ch = 0; ch < 4; ch++)
			dst[ch] = src[ch] * keep + tint[ch] * take;
	}
}
#endif
2987
2988
2989
2990 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2991 {
2992 #ifdef SSE_POSSIBLE
2993         int x;
2994         int startx = span->startx;
2995         int endx = span->endx;
2996         __m128 data, slope;
2997         __m128 mod, endmod;
2998         __m128i submod, substep, endsubmod;
2999         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3000         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3001         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3002         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3003         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3004         for (x = startx; x < endx;)
3005         {
3006                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3007                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3008                 if (nextsub >= endx)
3009                 {
3010                         nextsub = endsub = endx-1;
3011                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3012                 }
3013                 mod = endmod;
3014                 submod = endsubmod;
3015                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3016                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3017                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3018                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3019                 substep = _mm_packs_epi32(substep, substep);
3020                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3021                 {
3022                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3023                         pix = _mm_mulhi_epu16(pix, submod);
3024                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3025                 }
3026                 if (x <= endsub)
3027                 {
3028                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3029                         pix = _mm_mulhi_epu16(pix, submod);
3030                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3031                         x++;
3032                 }
3033         }
3034 #endif
3035 }
3036
3037 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3038 {
3039 #ifdef SSE_POSSIBLE
3040         int x;
3041         int startx = span->startx;
3042         int endx = span->endx;
3043         __m128 data, slope;
3044         __m128 mod, endmod;
3045         __m128i submod, substep, endsubmod;
3046         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3047         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3048         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3049         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3050         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3051         for (x = startx; x < endx;)
3052         {
3053                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3054                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3055                 if (nextsub >= endx)
3056                 {
3057                         nextsub = endsub = endx-1;
3058                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3059                 }
3060                 mod = endmod;
3061                 submod = endsubmod;
3062                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3063                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3064                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3065                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3066                 substep = _mm_packs_epi32(substep, substep);
3067                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3068                 {
3069                         __m128i pix = _mm_srai_epi16(submod, 4);
3070                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3071                 }
3072                 if (x <= endsub)
3073                 {
3074                         __m128i pix = _mm_srai_epi16(submod, 4);
3075                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3076                         x++;
3077                 }
3078         }
3079 #endif
3080 }
3081
3082 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3083 {
3084 #ifdef SSE_POSSIBLE
3085         int x, startx = span->startx, endx = span->endx;
3086         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3087         localcolor = _mm_packs_epi32(localcolor, localcolor);
3088         for (x = startx;x+2 <= endx;x+=2)
3089         {
3090                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3091                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3092                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3093                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3094         }
3095         if (x < endx)
3096         {
3097                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3098                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3099                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3100                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3101         }
3102 #endif
3103 }
3104
3105 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3106 {
3107 #ifdef SSE_POSSIBLE
3108         int x, startx = span->startx, endx = span->endx;
3109         for (x = startx;x+2 <= endx;x+=2)
3110         {
3111                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3112                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3113                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3114                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3115         }
3116         if (x < endx)
3117         {
3118                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3119                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3120                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3121                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3122         }
3123 #endif
3124 }
3125
3126 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3127 {
3128 #ifdef SSE_POSSIBLE
3129         int x, startx = span->startx, endx = span->endx;
3130         for (x = startx;x+2 <= endx;x+=2)
3131         {
3132                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3133                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3134                 pix1 = _mm_add_epi16(pix1, pix2);
3135                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3136         }
3137         if (x < endx)
3138         {
3139                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3140                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3141                 pix1 = _mm_add_epi16(pix1, pix2);
3142                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3143         }
3144 #endif
3145 }
3146
3147 #if 0
3148 static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3149 {
3150 #ifdef SSE_POSSIBLE
3151         int x, startx = span->startx, endx = span->endx;
3152         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3153         tint = _mm_packs_epi32(tint, tint);
3154         for (x = startx;x+2 <= endx;x+=2)
3155         {
3156                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3157                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3158                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3159                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3160         }
3161         if (x < endx)
3162         {
3163                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3164                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3165                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3166                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3167         }
3168 #endif
3169 }
3170 #endif
3171
3172 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3173 {
3174 #ifdef SSE_POSSIBLE
3175         int x, startx = span->startx, endx = span->endx;
3176         for (x = startx;x+2 <= endx;x+=2)
3177         {
3178                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3179                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3180                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3181                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3182                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3183         }
3184         if (x < endx)
3185         {
3186                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3187                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3188                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3189                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3190                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3191         }
3192 #endif
3193 }
3194
3195 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3196 {
3197 #ifdef SSE_POSSIBLE
3198         int x, startx = span->startx, endx = span->endx;
3199         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3200         localcolor = _mm_packs_epi32(localcolor, localcolor);
3201         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3202         for (x = startx;x+2 <= endx;x+=2)
3203         {
3204                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3205                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3206                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3207         }
3208         if (x < endx)
3209         {
3210                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3211                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3212                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3213         }
3214 #endif
3215 }
3216
3217
3218
3219 static void DPSOFTRAST_VertexShader_Generic(void)
3220 {
3221         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3222         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3223         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3224         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3225                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3226 }
3227
3228 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3229 {
3230         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3231         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3232         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3233         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3234         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3235         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3236         {
3237                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3238                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3239                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3240                 {
3241                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3242                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3243                         {
3244                                 // multiply
3245                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3246                         }
3247                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3248                         {
3249                                 // add
3250                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3251                         }
3252                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3253                         {
3254                                 // alphablend
3255                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3256                         }
3257                 }
3258         }
3259         else
3260                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3261         if(thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
3262         {
3263                 int x;
3264                 for (x = span->startx;x < span->endx;x++)
3265                         buffer_FragColorbgra8[x*4+3] = buffer_FragColorbgra8[x*4+3] * thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3266         }
3267         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3268 }
3269
3270
3271
3272 static void DPSOFTRAST_VertexShader_PostProcess(void)
3273 {
3274         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3275         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3276         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3277 }
3278
3279 static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3280 {
3281         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3282         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3283         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3284         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3285         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3286         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3287         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3288         {
3289                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3290                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3291         }
3292         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3293         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3294         {
3295                 // TODO: implement saturation
3296         }
3297         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3298         {
3299                 // TODO: implement gammaramps
3300         }
3301         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3302 }
3303
3304
3305
3306 static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3307 {
3308         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3309 }
3310
3311 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3312 {
3313         // this is never called (because colormask is off when this shader is used)
3314         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3315         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3316         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3317         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3318         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3319 }
3320
3321
3322
3323 static void DPSOFTRAST_VertexShader_FlatColor(void)
3324 {
3325         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3326         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3327 }
3328
3329 static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3330 {
3331 #ifdef SSE_POSSIBLE
3332         unsigned char * RESTRICT pixelmask = span->pixelmask;
3333         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3334         int x, startx = span->startx, endx = span->endx;
3335         __m128i Color_Ambientm;
3336         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3337         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3338         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3339         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3340         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3341         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3342                 pixel = buffer_FragColorbgra8;
3343         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3344         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3345         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3346         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3347         for (x = startx;x < endx;x++)
3348         {
3349                 __m128i color, pix;
3350                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3351                 {
3352                         __m128i pix2;
3353                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3354                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3355                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3356                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3357                         x += 3;
3358                         continue;
3359                 }
3360                 if (!pixelmask[x])
3361                         continue;
3362                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3363                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3364                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3365         }
3366         if (pixel == buffer_FragColorbgra8)
3367                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3368 #endif
3369 }
3370
3371
3372
3373 static void DPSOFTRAST_VertexShader_VertexColor(void)
3374 {
3375         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3376         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3377         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3378 }
3379
3380 static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3381 {
3382 #ifdef SSE_POSSIBLE
3383         unsigned char * RESTRICT pixelmask = span->pixelmask;
3384         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3385         int x, startx = span->startx, endx = span->endx;
3386         __m128i Color_Ambientm, Color_Diffusem;
3387         __m128 data, slope;
3388         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3389         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3390         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3391         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3392         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3393         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3394         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3395                 pixel = buffer_FragColorbgra8;
3396         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3397         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3398         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3399         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3400         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3401         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3402         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3403         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3404         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3405         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3406         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3407         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3408         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3409         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3410         {
3411                 __m128i color, mod, pix;
3412                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3413                 {
3414                         __m128i pix2, mod2;
3415                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3416                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3417                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3418                         data = _mm_add_ps(data, slope);
3419                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3420                         data = _mm_add_ps(data, slope);
3421                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3422                         data = _mm_add_ps(data, slope);
3423                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3424                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3425                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3426                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3427                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3428                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3429                         x += 3;
3430                         continue;
3431                 }
3432                 if (!pixelmask[x])
3433                         continue;
3434                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3435                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3436                 mod = _mm_packs_epi32(mod, mod);
3437                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3438                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3439         }
3440         if (pixel == buffer_FragColorbgra8)
3441                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3442 #endif
3443 }
3444
3445
3446
3447 static void DPSOFTRAST_VertexShader_Lightmap(void)
3448 {
3449         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3450         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3451         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3452 }
3453
// Pixel shader for the lightmap path. For every covered pixel of the span it
// computes, on BGRA8 data,
//     (lightmap * Color_Diffuse + Color_Ambient) * color
// and, when SHADERPERMUTATION_GLOW is set, adds glow * Color_Glow.
// color and glow are sampled with TEXCOORD0, the lightmap with TEXCOORD4.
// Results are written straight into color buffer 0 unless alpha-kill or a
// non-opaque blend mode is active; in that case they are staged in
// buffer_FragColorbgra8 and handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// The whole body is compiled only when SSE_POSSIBLE is defined; without it the
// function does nothing.
3454 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3455 {
3456 #ifdef SSE_POSSIBLE
3457         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // default destination: the framebuffer row of this span (4 bytes per pixel)
3458         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3459         int x, startx = span->startx, endx = span->endx;
3460         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3461         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3462         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3463         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3464         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3465         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3466         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3467         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3468         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
             // alpha-kill and blending need the finish pass, so render into the temporary buffer instead
3469         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3470                 pixel = buffer_FragColorbgra8;
             // Build the 16-bit colour constants: scale the uniform by 256, swap lanes 0
             // and 2 (_MM_SHUFFLE(3, 0, 1, 2)), then pack to 16 bits so the same four
             // values sit in both 64-bit halves of the register.
             // Ambient carries Alpha*255 in its 4th lane; Diffuse and Glow have that lane zeroed.
3471         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3472         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3473         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3474         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3475         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3476         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3477         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3478         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3479         {
3480                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3481                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3482                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3483                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
                     // low half = ambient, high half = glow; used by the single-pixel path below
3484                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3485                 for (x = startx;x < endx;x++)
3486                 {
3487                         __m128i color, lightmap, glow, pix;
                             // fast path: the next four mask bytes are all 1, so shade four pixels at once
3488                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3489                         {
3490                                 __m128i pix2;
3491                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3492                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3493                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
                                     // unpack*_epi8(zero, t) puts each texel byte into the high byte of a
                                     // 16-bit lane; _mm_mulhi_epu16 keeps the high 16 bits of each product.
                                     // pix handles pixels x and x+1, pix2 handles pixels x+2 and x+3.
3494                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3495                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3496                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3497                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3498                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3499                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3500                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                     // together with the loop's x++ this advances by four pixels
3501                                 x += 3;
3502                                 continue;
3503                         }
                             // slow path: one pixel at a time, skipping masked-out pixels
3504                         if (!pixelmask[x])
3505                                 continue;
3506                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3507                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3508                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
                             // low 64 bits:  (lightmap * Diffuse + Ambient) * color
                             // high 64 bits: lightmap is zero there, so this is Glow * glow
3509                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
                             // fold the glow term (high half) onto the lit colour (low half)
3510                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3511                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3512                 }
3513         }
3514         else
3515         {
                     // same loop as above without the glow term
3516                 for (x = startx;x < endx;x++)
3517                 {
3518                         __m128i color, lightmap, pix;
                             // fast path: four fully covered pixels at once
3519                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3520                         {
3521                                 __m128i pix2;
3522                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3523                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3524                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3525                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3526                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3527                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3528                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3529                                 x += 3;
3530                                 continue;
3531                         }
                             // slow path: single pixel, skipped when masked out
3532                         if (!pixelmask[x]) 
3533                                 continue;
3534                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3535                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3536                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3537                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3538                 }
3539         }
             // results were staged in the temporary buffer: let the finish pass write them out
3540         if (pixel == buffer_FragColorbgra8)
3541                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3542 #endif
3543 }
3544
3545
// Forward declarations: the FakeLight and LightDirectionMap shaders below call
// these two functions, which are defined further down in this file.
3546 void DPSOFTRAST_VertexShader_LightDirection(void);
3547 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3548
// FakeLight vertex shader: has no work of its own and simply runs the shared
// LightDirection vertex stage.
3549 static void DPSOFTRAST_VertexShader_FakeLight(void)
3550 {
3551         DPSOFTRAST_VertexShader_LightDirection();
3552 }
3553
// FakeLight pixel shader: forwards its arguments unchanged to the shared
// LightDirection pixel stage, which branches on thread->shader_mode itself.
3554 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3555 {
3556         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3557 }
3558
3559
3560
// Model-space light-direction-map vertex shader: runs the shared LightDirection
// vertex stage and then additionally loads TEXCOORD4, which the LightDirection
// pixel stage uses to sample the lightmap and deluxemap textures.
3561 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3562 {
3563         DPSOFTRAST_VertexShader_LightDirection();
3564         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3565 }
3566
// Model-space light-direction-map pixel shader: forwards its arguments unchanged
// to the shared LightDirection pixel stage, which handles
// SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE by checking thread->shader_mode.
3567 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3568 {
3569         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3570 }
3571
3572
3573
// Tangent-space light-direction-map vertex shader: runs the shared LightDirection
// vertex stage and then additionally loads TEXCOORD4, which the LightDirection
// pixel stage uses to sample the lightmap and deluxemap textures.
3574 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3575 {
3576         DPSOFTRAST_VertexShader_LightDirection();
3577         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3578 }
3579
// Tangent-space light-direction-map pixel shader: forwards its arguments unchanged
// to the shared LightDirection pixel stage, which handles
// SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE by checking thread->shader_mode.
3580 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3581 {
3582         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3583 }
3584
3585
3586
3587 void DPSOFTRAST_VertexShader_LightDirection(void)
3588 {
3589         int i;
3590         int numvertices = dpsoftrast.numvertices;
3591         float LightDir[4];
3592         float LightVector[4];
3593         float EyePosition[4];
3594         float EyeVectorModelSpace[4];
3595         float EyeVector[4];
3596         float position[4];
3597         float svector[4];
3598         float tvector[4];
3599         float normal[4];
3600         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3601         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3602         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3603         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3604         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3605         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3606         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3607         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3608         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3609         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3610         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3611         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3612         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3613         for (i = 0;i < numvertices;i++)
3614         {
3615                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3616                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3617                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3618                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3619                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3620                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3621                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3622                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3623                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3624                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3625                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3626                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3627                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3628                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3629                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3630                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3631                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3632                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3633                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3634                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3635                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3636                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3637                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3638                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3639                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3640                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3641                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3642                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3643                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3644         }
3645         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3646 }
3647
3648 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3649 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3650 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3651 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3652 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3653 #define DPSOFTRAST_Vector3Normalize(v)\
3654 do\
3655 {\
3656         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3657         if (len)\
3658         {\
3659                 len = 1.0f / len;\
3660                 v[0] *= len;\
3661                 v[1] *= len;\
3662                 v[2] *= len;\
3663         }\
3664 }\
3665 while(0)
3666
3667 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3668 {
3669         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3670         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3671         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3672         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3673         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3674         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3675         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3676         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3677         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3678         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3679         int x, startx = span->startx, endx = span->endx;
3680         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3681         float LightVectordata[4];
3682         float LightVectorslope[4];
3683         float EyeVectordata[4];
3684         float EyeVectorslope[4];
3685         float VectorSdata[4];
3686         float VectorSslope[4];
3687         float VectorTdata[4];
3688         float VectorTslope[4];
3689         float VectorRdata[4];
3690         float VectorRslope[4];
3691         float z;
3692         float diffusetex[4];
3693         float glosstex[4];
3694         float surfacenormal[4];
3695         float lightnormal[4];
3696         float lightnormal_modelspace[4];
3697         float eyenormal[4];
3698         float specularnormal[4];
3699         float diffuse;
3700         float specular;
3701         float SpecularPower;
3702         int d[4];
3703         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3704         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3705         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3706         Color_Glow[3] = 0.0f;
3707         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3708         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3709         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3710         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3711         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3712         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3713         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3714         Color_Pants[3] = 0.0f;
3715         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3716         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3717         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3718         Color_Shirt[3] = 0.0f;
3719         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3720         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3721         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3722         {
3723                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3724                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3725         }
3726         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3727         {
3728                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3729         }
3730         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3731         {
3732                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3733                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3734                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3735                 Color_Diffuse[3] = 0.0f;
3736                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3737                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3738                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3739                 LightColor[3] = 0.0f;
3740                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3741                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3742                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3743                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3744                 Color_Specular[3] = 0.0f;
3745                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3746                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3747                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3748
3749                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3750                 {
3751                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3752                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3753                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3754                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3755                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3756                 }
3757                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3758                 {
3759                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3760                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3761                 }
3762                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3763                 {
3764                         // nothing of this needed
3765                 }
3766                 else
3767                 {
3768                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3769                 }
3770
3771                 for (x = startx;x < endx;x++)
3772                 {
3773                         z = buffer_z[x];
3774                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3775                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3776                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3777                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3778                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3779                         {
3780                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3781                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3782                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3783                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3784                         }
3785                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3786                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3787                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3788                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3789                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3790                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3791                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3792                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3793
3794                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3795                         {
3796                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3797                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3798                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3799                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3800
3801                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3802                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3803                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3804                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3805
3806                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3807                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3808                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3809                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3810
3811                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3812                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3813                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3814                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3815
3816                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3817                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3818
3819                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3820                                 {
3821                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3822                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3823                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3824                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3825                                 }
3826                         }
3827                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3828                         {
3829                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3830                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3831                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3832                                 {
3833                                         float f = 1.0f / 256.0f;
3834                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3835                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3836                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3837                                 }
3838                         }
3839                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3840                         {
3841                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3842                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3843                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3844                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3845
3846                                 LightColor[0] = 1.0;
3847                                 LightColor[1] = 1.0;
3848                                 LightColor[2] = 1.0;
3849                         }
3850                         else
3851                         {
3852                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3853                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3854                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3855                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3856                         }
3857
3858                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3859
3860                         if(thread->shader_exactspecularmath)
3861                         {
3862                                 // reflect lightnormal at surfacenormal, take the negative of that
3863                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3864                                 float f;
3865                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3866                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3867                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3868                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3869
3870                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3871                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3872                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3873                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3874                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3875
3876                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3877                         }
3878                         else
3879                         {
3880                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3881                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3882                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3883                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3884
3885                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3886                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3887                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3888                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3889
3890                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3891                         }
3892                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
3893
3894                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3895                         {
3896                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3897                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3898                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3899                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3900                         }
3901                         else
3902                         {
3903                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3904                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3905                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3906                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3907                         }
3908
3909                         buffer_FragColorbgra8[x*4+0] = d[0];
3910                         buffer_FragColorbgra8[x*4+1] = d[1];
3911                         buffer_FragColorbgra8[x*4+2] = d[2];
3912                         buffer_FragColorbgra8[x*4+3] = d[3];
3913                 }
3914         }
3915         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3916         {
3917                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3918                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3919                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3920                 Color_Diffuse[3] = 0.0f;
3921                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3922                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3923                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3924                 LightColor[3] = 0.0f;
3925                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3926
3927                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3928                 {
3929                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3930                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3931                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3932                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3933                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3934                 }
3935                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3936                 {
3937                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3938                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3939                 }
3940                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3941                 {
3942                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3943                 }
3944                 else
3945                 {
3946                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3947                 }
3948
3949                 for (x = startx;x < endx;x++)
3950                 {
3951                         z = buffer_z[x];
3952                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3953                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3954                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3955                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3956                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3957                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3958                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3959                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3960
3961                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3962                         {
3963                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3964                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3965                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3966                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3967
3968                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3969                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3970                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3971                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3972
3973                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3974                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3975                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3976                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3977
3978                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3979                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3980                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3981                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3982
3983                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3984                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3985
3986                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3987                                 {
3988                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3989                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3990                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3991                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3992                                 }
3993                         }
3994                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3995                         {
3996                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3997                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3998                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3999                                 {
4000                                         float f = 1.0f / 256.0f;
4001                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4002                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4003                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4004                                 }
4005                         }
4006                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4007                         {
4008                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4009                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4010                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4011                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4012
4013                                 LightColor[0] = 1.0;
4014                                 LightColor[1] = 1.0;
4015                                 LightColor[2] = 1.0;
4016                         }
4017                         else
4018                         {
4019                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4020                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4021                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4022                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4023                         }
4024
4025                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4026                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4027                         {
4028                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4029                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4030                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4031                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4032                         }
4033                         else
4034                         {
4035                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4036                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4037                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4038                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4039                         }
4040                         buffer_FragColorbgra8[x*4+0] = d[0];
4041                         buffer_FragColorbgra8[x*4+1] = d[1];
4042                         buffer_FragColorbgra8[x*4+2] = d[2];
4043                         buffer_FragColorbgra8[x*4+3] = d[3];
4044                 }
4045         }
4046         else
4047         {
4048                 for (x = startx;x < endx;x++)
4049                 {
4050                         // z = buffer_z[x];
4051                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4052                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4053                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4054                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4055
4056                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4057                         {
4058                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4059                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4060                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4061                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4062                         }
4063                         else
4064                         {
4065                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4066                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4067                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4068                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4069                         }
4070                         buffer_FragColorbgra8[x*4+0] = d[0];
4071                         buffer_FragColorbgra8[x*4+1] = d[1];
4072                         buffer_FragColorbgra8[x*4+2] = d[2];
4073                         buffer_FragColorbgra8[x*4+3] = d[3];
4074                 }
4075         }
4076         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4077 }
4078
4079
4080
4081 static void DPSOFTRAST_VertexShader_LightSource(void)
4082 {
4083         int i;
4084         int numvertices = dpsoftrast.numvertices;
4085         float LightPosition[4];
4086         float LightVector[4];
4087         float LightVectorModelSpace[4];
4088         float EyePosition[4];
4089         float EyeVectorModelSpace[4];
4090         float EyeVector[4];
4091         float position[4];
4092         float svector[4];
4093         float tvector[4];
4094         float normal[4];
4095         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4096         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4097         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4098         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4099         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4100         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4101         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4102         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4103         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4104         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4105         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4106         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4107         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4108         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4109         for (i = 0;i < numvertices;i++)
4110         {
4111                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4112                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4113                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4114                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4115                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4116                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4117                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4118                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4119                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4120                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4121                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4122                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4123                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4124                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4125                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4126                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4127                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4128                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4129                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4130                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4131                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4132                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4133                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4134                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4135                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4136                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4137                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4138                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4139                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4140                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4141                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4142                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4143         }
4144         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4145         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4146 }
4147
4148 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4149 {
4150 #ifdef SSE_POSSIBLE
4151         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4152         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4153         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4154         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4155         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4156         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4157         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4158         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4159         int x, startx = span->startx, endx = span->endx;
4160         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4161         float CubeVectordata[4];
4162         float CubeVectorslope[4];
4163         float LightVectordata[4];
4164         float LightVectorslope[4];
4165         float EyeVectordata[4];
4166         float EyeVectorslope[4];
4167         float z;
4168         float diffusetex[4];
4169         float glosstex[4];
4170         float surfacenormal[4];
4171         float lightnormal[4];
4172         float eyenormal[4];
4173         float specularnormal[4];
4174         float diffuse;
4175         float specular;
4176         float SpecularPower;
4177         float CubeVector[4];
4178         float attenuation;
4179         int d[4];
4180         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4181         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4182         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4183         Color_Glow[3] = 0.0f;
4184         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4185         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4186         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4187         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4188         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4189         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4190         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4191         Color_Diffuse[3] = 0.0f;
4192         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4193         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4194         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4195         Color_Specular[3] = 0.0f;
4196         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4197         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4198         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4199         Color_Pants[3] = 0.0f;
4200         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4201         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4202         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4203         Color_Shirt[3] = 0.0f;
4204         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4205         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4206         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4207         LightColor[3] = 0.0f;
4208         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4209         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4210         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4211         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4212         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4213         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4214         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4215         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4216         {
4217                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4218                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4219         }
4220         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4221                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4222         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4223         {
4224                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4225                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4226                 for (x = startx;x < endx;x++)
4227                 {
4228                         z = buffer_z[x];
4229                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4230                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4231                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4232                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4233                         if (attenuation < 0.01f)
4234                                 continue;
4235                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4236                         {
4237                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4238                                 if (attenuation < 0.01f)
4239                                         continue;
4240                         }
4241
4242                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4243                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4244                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4245                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4246                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4247                         {
4248                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4249                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4250                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4251                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4252                         }
4253                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4254                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4255                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4256                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4257                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4258                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4259                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4260                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4261
4262                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4263                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4264                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4265                         DPSOFTRAST_Vector3Normalize(lightnormal);
4266
4267                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4268
4269                         if(thread->shader_exactspecularmath)
4270                         {
4271                                 // reflect lightnormal at surfacenormal, take the negative of that
4272                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4273                                 float f;
4274                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4275                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4276                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4277                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4278
4279                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4280                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4281                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4282                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4283                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4284
4285                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4286                         }
4287                         else
4288                         {
4289                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4290                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4291                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4292                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4293
4294                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4295                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4296                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4297                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4298
4299                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4300                         }
4301                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
4302
4303                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4304                         {
4305                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4306                                 attenuation *= (1.0f / 255.0f);
4307                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4308                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4309                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4310                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4311                         }
4312                         else
4313                         {
4314                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4315                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4316                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4317                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4318                         }
4319                         buffer_FragColorbgra8[x*4+0] = d[0];
4320                         buffer_FragColorbgra8[x*4+1] = d[1];
4321                         buffer_FragColorbgra8[x*4+2] = d[2];
4322                         buffer_FragColorbgra8[x*4+3] = d[3];
4323                 }
4324         }
4325         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4326         {
4327                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4328                 for (x = startx;x < endx;x++)
4329                 {
4330                         z = buffer_z[x];
4331                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4332                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4333                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4334                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4335                         if (attenuation < 0.01f)
4336                                 continue;
4337                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4338                         {
4339                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4340                                 if (attenuation < 0.01f)
4341                                         continue;
4342                         }
4343
4344                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4345                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4346                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4347                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4348                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4349                         {
4350                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4351                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4352                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4353                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4354                         }
4355                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4356                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4357                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4358                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4359
4360                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4361                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4362                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4363                         DPSOFTRAST_Vector3Normalize(lightnormal);
4364
4365                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4366                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4367                         {
4368                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4369                                 attenuation *= (1.0f / 255.0f);
4370                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4371                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4372                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4373                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4374                         }
4375                         else
4376                         {
4377                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4378                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4379                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4380                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4381                         }
4382                         buffer_FragColorbgra8[x*4+0] = d[0];
4383                         buffer_FragColorbgra8[x*4+1] = d[1];
4384                         buffer_FragColorbgra8[x*4+2] = d[2];
4385                         buffer_FragColorbgra8[x*4+3] = d[3];
4386                 }
4387         }
4388         else
4389         {
4390                 for (x = startx;x < endx;x++)
4391                 {
4392                         z = buffer_z[x];
4393                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4394                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4395                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4396                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4397                         if (attenuation < 0.01f)
4398                                 continue;
4399                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4400                         {
4401                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4402                                 if (attenuation < 0.01f)
4403                                         continue;
4404                         }
4405
4406                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4407                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4408                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4409                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4410                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4411                         {
4412                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4413                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4414                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4415                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4416                         }
4417                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4418                         {
4419                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4420                                 attenuation *= (1.0f / 255.0f);
4421                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4422                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4423                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4424                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4425                         }
4426                         else
4427                         {
4428                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4429                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4430                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4431                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4432                         }
4433                         buffer_FragColorbgra8[x*4+0] = d[0];
4434                         buffer_FragColorbgra8[x*4+1] = d[1];
4435                         buffer_FragColorbgra8[x*4+2] = d[2];
4436                         buffer_FragColorbgra8[x*4+3] = d[3];
4437                 }
4438         }
4439         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4440 #endif
4441 }
4442
4443
4444
4445 static void DPSOFTRAST_VertexShader_Refraction(void)
4446 {
4447         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4448         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4449         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4450 }
4451
4452 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4453 {
4454         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4455         float z;
4456         int x, startx = span->startx, endx = span->endx;
4457
4458         // texture reads
4459         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4460         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4461
4462         // varyings
4463         float ModelViewProjectionPositiondata[4];
4464         float ModelViewProjectionPositionslope[4];
4465
4466         // uniforms
4467         float ScreenScaleRefractReflect[2];
4468         float ScreenCenterRefractReflect[2];
4469         float DistortScaleRefractReflect[2];
4470         float RefractColor[4];
4471
4472         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4473         if(!texture) return;
4474
4475         // read textures
4476         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4477         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4478
4479         // read varyings
4480         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4481
4482         // read uniforms
4483         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4484         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4485         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4486         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4487         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4488         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4489         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4490         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4491         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4492         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4493
4494         // do stuff
4495         for (x = startx;x < endx;x++)
4496         {
4497                 float SafeScreenTexCoord[2];
4498                 float ScreenTexCoord[2];
4499                 float v[3];
4500                 float iw;
4501                 unsigned char c[4];
4502
4503                 z = buffer_z[x];
4504
4505                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4506                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4507
4508                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4509                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4510                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4511
4512                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4513                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4514                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4515                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4516                 DPSOFTRAST_Vector3Normalize(v);
4517                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4518                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4519
4520                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4521                 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4522
4523                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4524                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4525                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4526                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4527         }
4528
4529         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4530 }
4531
4532
4533
4534 static void DPSOFTRAST_VertexShader_Water(void)
4535 {
4536         int i;
4537         int numvertices = dpsoftrast.numvertices;
4538         float EyePosition[4];
4539         float EyeVectorModelSpace[4];
4540         float EyeVector[4];
4541         float position[4];
4542         float svector[4];
4543         float tvector[4];
4544         float normal[4];
4545         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4546         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4547         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4548         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4549         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4550         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4551         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4552         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4553         for (i = 0;i < numvertices;i++)
4554         {
4555                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4556                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4557                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4558                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4559                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4560                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4561                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4562                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4563                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4564                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4565                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4566                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4567                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4568                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4569                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4570                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4571                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4572                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4573                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4574                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4575                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4576                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4577         }
4578         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4579         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4580         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4581 }
4582
4583
4584 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4585 {
4586         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4587         // float z;
4588         int x, startx = span->startx, endx = span->endx;
4589
4590         // texture reads
4591         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4592         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4593
4594         // varyings
4595         float ModelViewProjectionPositiondata[4];
4596         float ModelViewProjectionPositionslope[4];
4597         float EyeVectordata[4];
4598         float EyeVectorslope[4];
4599
4600         // uniforms
4601         float ScreenScaleRefractReflect[4];
4602         float ScreenCenterRefractReflect[4];
4603         float DistortScaleRefractReflect[4];
4604         float RefractColor[4];
4605         float ReflectColor[4];
4606         float ReflectFactor;
4607         float ReflectOffset;
4608
4609         DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4610         DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4611         if(!texture_refraction || !texture_reflection) return;
4612
4613         // read textures
4614         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4615         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4616
4617         // read varyings
4618         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4619         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4620
4621         // read uniforms
4622         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4623         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4624         ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4625         ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4626         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4627         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4628         ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4629         ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4630         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4631         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4632         DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4633         DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4634         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4635         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4636         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4637         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4638         ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4639         ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4640         ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4641         ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4642         ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4643         ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4644
4645         // do stuff
4646         for (x = startx;x < endx;x++)
4647         {
4648                 float SafeScreenTexCoord[4];
4649                 float ScreenTexCoord[4];
4650                 float v[3];
4651                 float iw;
4652                 unsigned char c1[4];
4653                 unsigned char c2[4];
4654                 float Fresnel;
4655
4656                 // z = buffer_z[x];
4657
4658                 // "    vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4659                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4660
4661                 // "    vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4662                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4663                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4664                 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4665                 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4666
4667                 // "    vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4668                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4669                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4670                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4671                 DPSOFTRAST_Vector3Normalize(v);
4672                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4673                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4674                 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4675                 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4676
4677                 // "    float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4678                 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4679                 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4680                 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4681                 DPSOFTRAST_Vector3Normalize(v);
4682                 Fresnel = 1.0f - v[2];
4683                 Fresnel = min(1.0f, Fresnel);
4684                 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4685
4686                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4687                 // "    dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4688                 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4689                 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4690
4691                 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4692                 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4693                 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4694                 buffer_FragColorbgra8[x*4+3] = min((    RefractColor[3] *  (1.0f - Fresnel) +          ReflectColor[3]  * Fresnel) * 256, 255);
4695         }
4696
4697         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4698 }
4699
4700
4701 // ShowDepth vertex stage: hands DPSOFTRAST_ARRAY_POSITION to DPSOFTRAST_Array_TransformProject as both array arguments, together with the uniform4f entries that start at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4702 static void DPSOFTRAST_VertexShader_ShowDepth(void)
4703 {
4704         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1); // uniform4f is addressed with a stride of 4 entries per uniform slot
4705 }
4706 // ShowDepth pixel stage (unimplemented placeholder): begins the span, zeroes the [startx, endx) pixels of a scratch BGRA8 row and passes that row to DPSOFTRAST_Draw_Span_FinishBGRA8.
4707 static void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4708 {
4709         // TODO: IMPLEMENT
4710         unsigned char zerobgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4711         float spanz[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4712         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, spanz);
4713         memset(&zerobgra8[span->startx*4], 0, (span->endx - span->startx)*4); // only the covered pixels are initialized, 4 bytes each
4714         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, zerobgra8);
4715 }
4716
4717
4718 // DeferredGeometry vertex stage: passes DPSOFTRAST_ARRAY_POSITION as both array arguments of DPSOFTRAST_Array_TransformProject, with the uniform4f entries beginning at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4719 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4720 {
4721         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1); // same call as the ShowDepth and DeferredLightSource vertex stages
4722 }
4723 // DeferredGeometry pixel stage (unimplemented placeholder): after DPSOFTRAST_Draw_Span_Begin it emits zero bytes for every pixel in [startx, endx) through DPSOFTRAST_Draw_Span_FinishBGRA8.
4724 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4725 {
4726         // TODO: IMPLEMENT
4727         unsigned char outbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4728         float zscratch[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4729         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, zscratch);
4730         memset(&outbgra8[span->startx*4], 0, (span->endx - span->startx)*4); // bytes outside the span are left uninitialized
4731         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, outbgra8);
4732 }
4733
4734
4735 // DeferredLightSource vertex stage: calls DPSOFTRAST_Array_TransformProject with DPSOFTRAST_ARRAY_POSITION in both array arguments and the uniform4f entries starting at DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1.
4736 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4737 {
4738         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1); // no other arrays are processed by this stage
4739 }
4740 // DeferredLightSource pixel stage (unimplemented placeholder): starts the span, fills its [startx, endx) pixels of a local BGRA8 row with zero and finishes the span with that row.
4741 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4742 {
4743         // TODO: IMPLEMENT
4744         unsigned char fragbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4745         float zrow[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4746         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, zrow);
4747         memset(&fragbgra8[span->startx*4], 0, (span->endx - span->startx)*4); // 4 bytes per pixel, covered range only
4748         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, fragbgra8);
4749 }
4750
4751
4752 // Dispatch record for one shader mode; DPSOFTRAST_ShaderModeTable holds SHADERMODE_COUNT of these and is indexed with thread->shader_mode.
4753 typedef struct DPSOFTRAST_ShaderModeInfo_s
4754 {
4755         int lodarrayindex; // array index whose per-triangle attribute deltas feed the mip edge scale in DPSOFTRAST_Interpret_Draw
4756         void (*Vertex)(void); // vertex stage entry point (the DPSOFTRAST_VertexShader_* functions)
4757         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span); // pixel stage, invoked once per surviving span by DPSOFTRAST_Draw_ProcessSpans
4758         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL]; // attribute arrays to set up per triangle; the scan stops at the first value >= DPSOFTRAST_ARRAY_TOTAL (the table writes ~0)
4759         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS]; // texture units inspected for mip selection; the scan stops at the first value >= DPSOFTRAST_MAXTEXTUREUNITS
4760 }
4761 DPSOFTRAST_ShaderModeInfo;
4762 // Shader dispatch table, indexed with thread->shader_mode. The arrays and texunits lists of a row are scanned until a value at or above their array bound, so each list is closed with ~0 (255 once stored as unsigned char).
4763 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4764 {
4765         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4766         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4767         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4768         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4769         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4770         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4771         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4772         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4773         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4774         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4775         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4776         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4777         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4778         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4779         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4780         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}, {~0}}, // explicit empty texunits list: an omitted initializer is zero-filled, which would name unit 0 in every slot
4781         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}, {~0}}, // placeholder shader, samples no textures
4782         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}, {~0}}, // placeholder shader, samples no textures
4783 };
4784 // Depth-tests one span: fills thread->pixelmaskarray for [startx, endx), trims the span to its first and last passing pixels, then stores the mask pointer and the trimmed range back into the span.
4785 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4786 {
4787         int x;
4788         int startx;
4789         int endx;
4790         unsigned int *depthpixel;
4791         int depth;
4792         int depthslope;
4793         unsigned int d; // depth of this span at pixel x, stepped by depthslope and compared as unsigned
4794         unsigned char *pixelmask;
4795         startx = span->startx;
4796         endx = span->endx;
4797         depth = span->depthbase;
4798         depthslope = span->depthslope;
4799         pixelmask = thread->pixelmaskarray;
4800         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4801         {
4802                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x; // formed only after the null check, as DPSOFTRAST_Draw_DepthWrite does; no pointer arithmetic on a null base
4803                 switch(thread->fb_depthfunc)
4804                 {
4805                 default: // unrecognized depth functions are treated as GL_ALWAYS
4806                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4807                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4808                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4809                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4810                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4811                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4812                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4813                 }
4814                 while (startx < endx && !pixelmask[startx]) // drop failing pixels from the left edge
4815                         startx++;
4816                 while (endx > startx && !pixelmask[endx-1]) // and from the right edge; an all-failing span ends with startx == endx
4817                         endx--;
4818         }
4819         else
4820         {
4821                 // no depth testing means we're just dealing with color...
4822                 memset(pixelmask + startx, 1, endx - startx);
4823         }
4824         span->pixelmask = pixelmask;
4825         span->startx = startx;
4826         span->endx = endx;
4827 }
4828 // Writes the span's interpolated depth into the depth buffer for every pixel still set in span->pixelmask; does nothing unless depthmask, depthtest and a depth buffer are all present.
4829 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4830 {
4831         int x, depth, depthslope, startx, endx;
4832         unsigned int d; // unsigned like the depth buffer and the accumulator in DPSOFTRAST_Draw_DepthTest, so stepping it wraps instead of overflowing a signed int
4833         const unsigned char *pixelmask;
4834         unsigned int *depthpixel;
4835         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4836         {
4837                 depth = span->depthbase;
4838                 depthslope = span->depthslope;
4839                 pixelmask = span->pixelmask;
4840                 startx = span->startx;
4841                 endx = span->endx;
4842                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4843                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4844                         if (pixelmask[x]) depthpixel[x] = d; // masked-out pixels keep their stored depth
4845         }
4846 }
4847 // Drains the thread's span queue: depth-test each span, shade the survivors when a color target and color mask are present, write depth, then reset the queue length to zero.
4848 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4849 {
4850         int spanindex = 0;
4851         while (spanindex < thread->numspans)
4852         {
4853                 DPSOFTRAST_State_Span *cur = &thread->spans[spanindex++];
4854                 DPSOFTRAST_State_Triangle *tri = &thread->triangles[cur->triangle];
4855                 DPSOFTRAST_Draw_DepthTest(thread, cur);
4856                 if (cur->startx < cur->endx)
4857                 {
4858                         // the pixel shader runs ahead of the depth write on purpose:
4859                         // it is allowed to clear pixelmask entries (alpha testing),
4860                         // and those pixels must then be left out of the depth buffer
4861                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4862                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, tri, cur);
4863                         DPSOFTRAST_Draw_DepthWrite(thread, cur);
4864                 }
4865         }
4866         // every queued span has been handled (or rejected by the depth test)
4867         thread->numspans = 0;
4868 }
4869 // Draw command record declared through DEFCOMMAND (macro defined outside this excerpt; the leading 22 is presumably the command id - confirm at the macro). DPSOFTRAST_Interpret_Draw reads these fields and decrements refcount.
4870 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;)
4871
4872 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4873 {
4874 #ifdef SSE_POSSIBLE
4875         int cullface = thread->cullface;
4876         int minx, maxx, miny, maxy;
4877         int miny1, maxy1, miny2, maxy2;
4878         __m128i fbmin, fbmax;
4879         __m128 viewportcenter, viewportscale;
4880         int firstvertex = command->firstvertex;
4881         int numvertices = command->numvertices;
4882         int numtriangles = command->numtriangles;
4883         const int *element3i = command->element3i;
4884         const unsigned short *element3s = command->element3s;
4885         int clipped = command->clipped;
4886         int i;
4887         int j;
4888         int k;
4889         int y;
4890         int e[3];
4891         __m128i screeny;
4892         int starty, endy, bandy;
4893         int numpoints;
4894         int clipcase;
4895         float clipdist[4];
4896         float clip0origin, clip0slope;
4897         int clip0dir;
4898         __m128 triangleedge1, triangleedge2, trianglenormal;
4899         __m128 clipfrac[3];
4900         __m128 screen[4];
4901         DPSOFTRAST_State_Triangle *triangle;
4902         DPSOFTRAST_Texture *texture;
4903         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4904         miny = thread->fb_scissor[1];
4905         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4906         miny1 = bound(miny, thread->miny1, maxy);
4907         maxy1 = bound(miny, thread->maxy1, maxy);
4908         miny2 = bound(miny, thread->miny2, maxy);
4909         maxy2 = bound(miny, thread->maxy2, maxy);
4910         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4911         {
4912                 if (!ATOMIC_DECREMENT(command->refcount))
4913                 {
4914                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4915                                 MM_FREE(command->arrays);
4916                 }
4917                 return;
4918         }
4919         minx = thread->fb_scissor[0];
4920         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4921         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4922         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4923         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4924         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4925         screen[3] = _mm_setzero_ps();
4926         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4927         for (i = 0;i < numtriangles;i++)
4928         {
4929                 const float *screencoord4f = command->arrays;
4930                 const float *arrays = screencoord4f + numvertices*4;
4931
4932                 // generate the 3 edges of this triangle
4933                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4934                 if (element3s)
4935                 {
4936                         e[0] = element3s[i*3+0] - firstvertex;
4937                         e[1] = element3s[i*3+1] - firstvertex;
4938                         e[2] = element3s[i*3+2] - firstvertex;
4939                 }
4940                 else if (element3i)
4941                 {
4942                         e[0] = element3i[i*3+0] - firstvertex;
4943                         e[1] = element3i[i*3+1] - firstvertex;
4944                         e[2] = element3i[i*3+2] - firstvertex;
4945                 }
4946                 else
4947                 {
4948                         e[0] = i*3+0;
4949                         e[1] = i*3+1;
4950                         e[2] = i*3+2;
4951                 }
4952
4953 #define SKIPBACKFACE \
4954                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4955                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4956                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4957                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4958                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4959                 switch(cullface) \
4960                 { \
4961                 case GL_BACK: \
4962                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4963                                 continue; \
4964                         break; \
4965                 case GL_FRONT: \
4966                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4967                                 continue; \
4968                         break; \
4969                 }
4970
4971 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4972                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4973                         { \
4974                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4975                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4976                         }
4977 #define CLIPPEDVERTEXCOPY(k,p1) \
4978                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4979
4980 #define GENATTRIBCOPY(attrib, p1) \
4981                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4982 #define GENATTRIBLERP(attrib, p1, p2) \
4983                 { \
4984                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4985                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4986                 }
4987 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4988                 switch(clipcase) \
4989                 { \
4990                 default: \
4991                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4992                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4993                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4994                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4995                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4996                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4997                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4998                 }
4999
5000                 if (! clipped)
5001                         goto notclipped;
5002
5003                 // calculate distance from nearplane
5004                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
5005                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
5006                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
5007                 if (clipdist[0] >= 0.0f)
5008                 {
5009                         if (clipdist[1] >= 0.0f)
5010                         {
5011                                 if (clipdist[2] >= 0.0f)
5012                                 {
5013                                 notclipped:
5014                                         // triangle is entirely in front of nearplane
5015                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5016                                         SKIPBACKFACE;
5017                                         numpoints = 3;
5018                                         clipcase = 0;
5019                                 }
5020                                 else
5021                                 {
5022                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5023                                         SKIPBACKFACE;
5024                                         numpoints = 4;
5025                                         clipcase = 1;
5026                                 }
5027                         }
5028                         else
5029                         {
5030                                 if (clipdist[2] >= 0.0f)
5031                                 {
5032                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5033                                         SKIPBACKFACE;
5034                                         numpoints = 4;
5035                                         clipcase = 2;
5036                                 }
5037                                 else
5038                                 {
5039                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5040                                         SKIPBACKFACE;
5041                                         numpoints = 3;
5042                                         clipcase = 3;
5043                                 }
5044                         }
5045                 }
5046                 else if (clipdist[1] >= 0.0f)
5047                 {
5048                         if (clipdist[2] >= 0.0f)
5049                         {
5050                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5051                                 SKIPBACKFACE;
5052                                 numpoints = 4;
5053                                 clipcase = 4;
5054                         }
5055                         else
5056                         {
5057                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5058                                 SKIPBACKFACE;
5059                                 numpoints = 3;
5060                                 clipcase = 5;
5061                         }
5062                 }
5063                 else if (clipdist[2] >= 0.0f)
5064                 {
5065                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5066                         SKIPBACKFACE;
5067                         numpoints = 3;
5068                         clipcase = 6;
5069                 }
5070                 else continue; // triangle is entirely behind nearplane
5071
5072                 {
5073                         // calculate integer y coords for triangle points
5074                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5075                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5076                                         screenmin = _mm_min_epi16(screeni, screenir),
5077                                         screenmax = _mm_max_epi16(screeni, screenir);
5078                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5079                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5080                         screenmin = _mm_max_epi16(screenmin, fbmin);
5081                         screenmax = _mm_min_epi16(screenmax, fbmax);
5082                         // skip offscreen triangles
5083                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5084                                 continue;
5085                         starty = _mm_extract_epi16(screenmin, 1);
5086                         endy = _mm_extract_epi16(screenmax, 1)+1;
5087                         if (starty >= maxy1 && endy <= miny2)
5088                                 continue;
5089                         screeny = _mm_srai_epi32(screeni, 16);
5090                 }
5091
5092                 triangle = &thread->triangles[thread->numtriangles];
5093
5094                 // calculate attribute plans for triangle data...
5095                 // okay, this triangle is going to produce spans, we'd better project
5096                 // the interpolants now (this is what gives perspective texturing),
5097                 // this consists of simply multiplying all arrays by the W coord
5098                 // (which is basically 1/Z), which will be undone per-pixel
5099                 // (multiplying by Z again) to get the perspective-correct array
5100                 // values
5101                 {
5102                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5103                         __m128 mipedgescale, mipdensity;
5104                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5105                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5106                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5107                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5108                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5109                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5110                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5111                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5112                         attribedge1 = _mm_sub_ss(w0, w1);
5113                         attribedge2 = _mm_sub_ss(w2, w1);
5114                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5115                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5116                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5117                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5118                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5119                         _mm_store_ss(&triangle->w[0], attribxslope);
5120                         _mm_store_ss(&triangle->w[1], attribyslope);
5121                         _mm_store_ss(&triangle->w[2], attriborigin);
5122                         
5123                         clip0origin = 0;
5124                         clip0slope = 0;
5125                         clip0dir = 0;
5126                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5127                         {
5128                                 float cliporigin, clipxslope, clipyslope;
5129                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5130                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5131                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5132                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5133                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5134                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5135                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5136                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5137                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5138                                 if(clipxslope != 0)
5139                                 {
5140                                         clip0origin = -cliporigin/clipxslope;
5141                                         clip0slope = -clipyslope/clipxslope;
5142                                         clip0dir = clipxslope > 0 ? 1 : -1;
5143                                 }
5144                                 else if(clipyslope > 0)
5145                                 {
5146                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5147                                         clip0slope = dpsoftrast.fb_width;
5148                                         clip0dir = -1;
5149                                 }
5150                                 else if(clipyslope < 0)
5151                                 {
5152                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5153                                         clip0slope = -dpsoftrast.fb_width;
5154                                         clip0dir = -1;
5155                                 }
5156                                 else if(clip0origin < 0) continue;
5157                         }
5158
5159                         mipedgescale = _mm_setzero_ps();
5160                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5161                         {
5162                                 __m128 attrib0, attrib1, attrib2;
5163                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5164                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5165                                         break;
5166                                 arrays += numvertices*4;
5167                                 GENATTRIBS(attrib0, attrib1, attrib2);
5168                                 attriborigin = _mm_mul_ps(attrib1, w1);
5169                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5170                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5171                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5172                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5173                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5174                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5175                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5176                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5177                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5178                                 {
5179                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5180                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5181                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5182                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5183                                 }
5184                         }
5185
5186                         memset(triangle->mip, 0, sizeof(triangle->mip));
5187                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5188                         {
5189                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5190                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5191                                         break;
5192                                 texture = thread->texbound[texunit];
5193                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5194                                 {
5195                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5196                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5197                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5198                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5199                                         // this will be multiplied in the texturing routine by the texture resolution
5200                                         y = _mm_cvtss_si32(mipdensity);
5201                                         if (y > 0)
5202                                         {
5203                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5204                                                 if (y > texture->mipmaps - 1)
5205                                                         y = texture->mipmaps - 1;
5206                                                 triangle->mip[texunit] = y;
5207                                         }
5208                                 }
5209                         }
5210                 }
5211         
5212                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5213                 for (; y < bandy;)
5214                 {
5215                         __m128 xcoords, xslope;
5216                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5217                         int yccmask = _mm_movemask_epi8(ycc);
5218                         int edge0p, edge0n, edge1p, edge1n;
5219                         int nexty;
5220                         float w, wslope;
5221                         float clip0;
5222                         if (numpoints == 4)
5223                         {
5224                                 switch(yccmask)
5225                                 {
5226                                 default:
5227                                 case 0xFFFF: /*0000*/ y = endy; continue;
5228                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5229                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5230                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5231                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5232                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5233                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5234                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5235                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5236                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5237                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5238                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5239                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5240                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5241                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5242                                 case 0x0000: /*1111*/ y++; continue;
5243                                 }
5244                         }
5245                         else
5246                         {
5247                                 switch(yccmask)
5248                                 {
5249                                 default:
5250                                 case 0xFFFF: /*000*/ y = endy; continue;
5251                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5252                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5253                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5254                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5255                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5256                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5257                                 case 0x0000: /*111*/ y++; continue;
5258                                 }
5259                         }
5260                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5261                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5262                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5263                         nexty = _mm_extract_epi16(ycc, 0);
5264                         if (nexty >= bandy) nexty = bandy-1;
5265                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5266                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5267                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5268                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5269                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5270                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5271                         {
5272                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5273                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5274                         }
5275                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5276                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5277                         {
5278                                 int startx, endx, offset;
5279                                 startx = _mm_cvtss_si32(xcoords);
5280                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5281                                 if (startx < minx) startx = minx;
5282                                 if (endx > maxx) endx = maxx;
5283                                 if (startx >= endx) continue;
5284
5285                                 if (clip0dir)
5286                                 {
5287                                         if (clip0dir > 0)
5288                                         {
5289                                                 if (startx < clip0) 
5290                                                 {
5291                                                         if(endx <= clip0) continue;
5292                                                         startx = (int)clip0;
5293                                                 }
5294                                         }
5295                                         else if (endx > clip0) 
5296                                         {
5297                                                 if(startx >= clip0) continue;
5298                                                 endx = (int)clip0;
5299                                         }
5300                                 }
5301                                                 
5302                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5303                                 {
5304                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5305                                         span->triangle = thread->numtriangles;
5306                                         span->x = offset;
5307                                         span->y = y;
5308                                         span->startx = 0;
5309                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5310                                         if (span->startx >= span->endx)
5311                                                 continue;
5312                                         wslope = triangle->w[0];
5313                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5314                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5315                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5316                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5317                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5318                                 }
5319                         }
5320                 }
5321
5322                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5323                 {
5324                         DPSOFTRAST_Draw_ProcessSpans(thread);
5325                         thread->numtriangles = 0;
5326                 }
5327         }
5328
5329         if (!ATOMIC_DECREMENT(command->refcount))
5330         {
5331                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5332                         MM_FREE(command->arrays);
5333         }
5334
5335         if (thread->numspans > 0 || thread->numtriangles > 0)
5336         {
5337                 DPSOFTRAST_Draw_ProcessSpans(thread);
5338                 thread->numtriangles = 0;
5339         }
5340 #endif
5341 }
5342
5343 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5344 {
5345         int i;
5346         int j;
5347         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5348         int datasize = 2*numvertices*sizeof(float[4]);
5349         DPSOFTRAST_Command_Draw *command;
5350         unsigned char *data;
5351         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5352         {
5353                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5354                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5355                         break;
5356                 datasize += numvertices*sizeof(float[4]);
5357         }
5358         if (element3s)
5359                 datasize += numtriangles*sizeof(unsigned short[3]);
5360         else if (element3i)
5361                 datasize += numtriangles*sizeof(int[3]);
5362         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5363         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5364         {
5365                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5366                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5367         }
5368         else
5369         {
5370                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5371                 data = (unsigned char *)command + commandsize;
5372         }
5373         command->firstvertex = firstvertex;
5374         command->numvertices = numvertices;
5375         command->numtriangles = numtriangles;
5376         command->arrays = (float *)data;
5377         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5378         dpsoftrast.firstvertex = firstvertex;
5379         dpsoftrast.numvertices = numvertices;
5380         dpsoftrast.screencoord4f = (float *)data;
5381         data += numvertices*sizeof(float[4]);
5382         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5383         data += numvertices*sizeof(float[4]);
5384         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5385         {
5386                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5387                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5388                         break;
5389                 dpsoftrast.post_array4f[j] = (float *)data;
5390                 data += numvertices*sizeof(float[4]);
5391         }
5392         command->element3i = NULL;
5393         command->element3s = NULL;
5394         if (element3s)
5395         {
5396                 command->element3s = (unsigned short *)data;
5397                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5398         }
5399         else if (element3i)
5400         {
5401                 command->element3i = (int *)data;
5402                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5403         }
5404         return command;
5405 }
5406
5407 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5408 {
5409         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5410         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5411         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5412         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5413         if (command->starty >= command->endy)
5414         {
5415                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5416                         MM_FREE(command->arrays);
5417                 DPSOFTRAST_UndoCommand(command->commandsize);
5418                 return;
5419         }
5420         command->clipped = dpsoftrast.drawclipped;
5421         command->refcount = dpsoftrast.numthreads;
5422
5423         if (dpsoftrast.usethreads)
5424         {
5425                 int i;
5426                 DPSOFTRAST_Draw_SyncCommands();
5427                 for (i = 0; i < dpsoftrast.numthreads; i++)
5428                 {
5429                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5430                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5431                                 Thread_CondSignal(thread->drawcond);
5432                 }
5433         }
5434         else
5435         {
5436                 DPSOFTRAST_Draw_FlushThreads();
5437         }
5438 }
5439
5440 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
5441 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5442 {
5443         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5444 }
5445 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5446 {
5447         DPSOFTRAST_Command_SetRenderTargets *command;
5448         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5449                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5450                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5451                 DPSOFTRAST_Flush();
5452         dpsoftrast.fb_width = width;
5453         dpsoftrast.fb_height = height;
5454         dpsoftrast.fb_depthpixels = depthpixels;
5455         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5456         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5457         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5458         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5459         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5460         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5461         command->width = width;
5462         command->height = height;
5463 }
5464  
// Executes this thread's pending commands from the shared command pool.
// Starts at thread->commandoffset and dispatches on each command's opcode
// until the offset equals endoffset; the final offset is written back to
// thread->commandoffset.
// NOTE(review): the switch has no default case; an opcode that is not listed
// leaves commandoffset unchanged, so the loop would never terminate.
5465 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5466 {
5467         int commandoffset = thread->commandoffset;
5468         while (commandoffset != endoffset)
5469         {
5470                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5471                 switch (command->opcode)
5472                 {
// fixed-size commands: call the matching DPSOFTRAST_Interpret_<name> handler,
// step over the aligned command struct, and wrap to offset 0 once the offset
// reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL
5473 #define INTERPCOMMAND(name) \
5474                 case DPSOFTRAST_OPCODE_##name : \
5475                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5476                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5477                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5478                                 commandoffset = 0; \
5479                         break;
5480                 INTERPCOMMAND(Viewport)
5481                 INTERPCOMMAND(ClearColor)
5482                 INTERPCOMMAND(ClearDepth)
5483                 INTERPCOMMAND(ColorMask)
5484                 INTERPCOMMAND(DepthTest)
5485                 INTERPCOMMAND(ScissorTest)
5486                 INTERPCOMMAND(Scissor)
5487                 INTERPCOMMAND(BlendFunc)
5488                 INTERPCOMMAND(BlendSubtract)
5489                 INTERPCOMMAND(DepthMask)
5490                 INTERPCOMMAND(DepthFunc)
5491                 INTERPCOMMAND(DepthRange)
5492                 INTERPCOMMAND(PolygonOffset)
5493                 INTERPCOMMAND(CullFace)
5494                 INTERPCOMMAND(SetTexture)
5495                 INTERPCOMMAND(SetShader)
5496                 INTERPCOMMAND(Uniform4f)
5497                 INTERPCOMMAND(UniformMatrix4f)
5498                 INTERPCOMMAND(Uniform1i)
5499                 INTERPCOMMAND(SetRenderTargets)
5500                 INTERPCOMMAND(ClipPlane)
5501
                // draw commands are variable-sized, so the step comes from the
                // command's own commandsize; the thread's offset is stored right
                // after each draw instead of only at the end of the loop
                // (presumably so the progress check in DPSOFTRAST_Draw_FlushThreads
                // sees it early - confirm)
5502                 case DPSOFTRAST_OPCODE_Draw:
5503                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5504                         commandoffset += command->commandsize;
5505                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5506                                 commandoffset = 0;
5507                         thread->commandoffset = commandoffset;
5508                         break;
5509
                // Reset sends the reader back to the start of the pool
5510                 case DPSOFTRAST_OPCODE_Reset:
5511                         commandoffset = 0;
5512                         break;
5513                 }
5514         }
5515         thread->commandoffset = commandoffset;
5516 }
5517
// Thread entry point handed to Thread_CreateThread by DPSOFTRAST_Init.
// data is the thread's DPSOFTRAST_State_Thread. The loop runs until
// thread->index becomes negative (DPSOFTRAST_Shutdown sets it to -1) and
// always returns 0.
5518 static int DPSOFTRAST_Draw_Thread(void *data)
5519 {
5520         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5521         while(thread->index >= 0)
5522         {
                // commands are pending: interpret everything up to the published offset
5523                 if (thread->commandoffset != dpsoftrast.drawcommand)
5524                 {
5525                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5526                 }
5527                 else 
5528                 {
                        // caught up: re-check under drawmutex before sleeping on drawcond
5529                         Thread_LockMutex(thread->drawmutex);
5530                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5531                         {
                                // wake a flusher blocked on waitcond (DPSOFTRAST_Draw_FlushThreads
                                // sets thread->waiting), then advertise that this thread is idle
5532                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5533                                 thread->starving = true;
5534                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5535                                 thread->starving = false;
5536                         }
5537                         Thread_UnlockMutex(thread->drawmutex);
5538                 }
5539         }   
5540         return 0;
5541 }
5542
5543 static void DPSOFTRAST_Draw_FlushThreads(void)
5544 {
5545         DPSOFTRAST_State_Thread *thread;
5546         int i;
5547         DPSOFTRAST_Draw_SyncCommands();
5548         if (dpsoftrast.usethreads) 
5549         {
5550                 for (i = 0; i < dpsoftrast.numthreads; i++)
5551                 {
5552                         thread = &dpsoftrast.threads[i];
5553                         if (thread->commandoffset != dpsoftrast.drawcommand)
5554                         {
5555                                 Thread_LockMutex(thread->drawmutex);
5556                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5557                                         Thread_CondSignal(thread->drawcond);
5558                                 Thread_UnlockMutex(thread->drawmutex);
5559                         }
5560                 }
5561                 for (i = 0; i < dpsoftrast.numthreads; i++)
5562                 {
5563                         thread = &dpsoftrast.threads[i];
5564                         if (thread->commandoffset != dpsoftrast.drawcommand)
5565                         {
5566                                 Thread_LockMutex(thread->drawmutex);
5567                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5568                                 {
5569                                         thread->waiting = true;
5570                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5571                                         thread->waiting = false;
5572                                 }
5573                                 Thread_UnlockMutex(thread->drawmutex);
5574                         }
5575                 }
5576         }
5577         else
5578         {
5579                 for (i = 0; i < dpsoftrast.numthreads; i++)
5580                 {
5581                         thread = &dpsoftrast.threads[i];
5582                         if (thread->commandoffset != dpsoftrast.drawcommand)
5583                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5584                 }
5585         }
5586         dpsoftrast.commandpool.usedcommands = 0;
5587 }
5588
// Public entry point: executes all queued commands by delegating to
// DPSOFTRAST_Draw_FlushThreads.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5593
// Public entry point: currently identical to DPSOFTRAST_Flush.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5598
5599 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5600 {
5601         int i;
5602         union
5603         {
5604                 int i;
5605                 unsigned char b[4];
5606         }
5607         u;
5608         u.i = 1;
5609         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5610         dpsoftrast.bigendian = u.b[3];
5611         dpsoftrast.fb_width = width;
5612         dpsoftrast.fb_height = height;
5613         dpsoftrast.fb_depthpixels = depthpixels;
5614         dpsoftrast.fb_colorpixels[0] = colorpixels;
5615         dpsoftrast.fb_colorpixels[1] = NULL;
5616         dpsoftrast.fb_colorpixels[1] = NULL;
5617         dpsoftrast.fb_colorpixels[1] = NULL;
5618         dpsoftrast.viewport[0] = 0;
5619         dpsoftrast.viewport[1] = 0;
5620         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5621         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5622         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5623         dpsoftrast.texture_firstfree = 1;
5624         dpsoftrast.texture_end = 1;
5625         dpsoftrast.texture_max = 0;
5626         dpsoftrast.color[0] = 1;
5627         dpsoftrast.color[1] = 1;
5628         dpsoftrast.color[2] = 1;
5629         dpsoftrast.color[3] = 1;
5630         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5631         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5632         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5633         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5634         for (i = 0; i < dpsoftrast.numthreads; i++)
5635         {
5636                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5637                 thread->index = i;
5638                 thread->cullface = GL_BACK;
5639         thread->colormask[0] = 1; 
5640                 thread->colormask[1] = 1;
5641                 thread->colormask[2] = 1;
5642                 thread->colormask[3] = 1;
5643                 thread->blendfunc[0] = GL_ONE;
5644                 thread->blendfunc[1] = GL_ZERO;
5645                 thread->depthmask = true;
5646                 thread->depthtest = true;
5647                 thread->depthfunc = GL_LEQUAL;
5648                 thread->scissortest = false;
5649                 thread->viewport[0] = 0;
5650                 thread->viewport[1] = 0;
5651                 thread->viewport[2] = dpsoftrast.fb_width;
5652                 thread->viewport[3] = dpsoftrast.fb_height;
5653                 thread->scissor[0] = 0;
5654                 thread->scissor[1] = 0;
5655                 thread->scissor[2] = dpsoftrast.fb_width;
5656                 thread->scissor[3] = dpsoftrast.fb_height;
5657                 thread->depthrange[0] = 0;
5658                 thread->depthrange[1] = 1;
5659                 thread->polygonoffset[0] = 0;
5660                 thread->polygonoffset[1] = 0;
5661                 thread->clipplane[0] = 0;
5662                 thread->clipplane[1] = 0;
5663                 thread->clipplane[2] = 0;
5664                 thread->clipplane[3] = 1;
5665         
5666                 thread->numspans = 0;
5667                 thread->numtriangles = 0;
5668                 thread->commandoffset = 0;
5669                 thread->waiting = false;
5670                 thread->starving = false;
5671            
5672                 thread->validate = -1;
5673                 DPSOFTRAST_Validate(thread, -1);
5674  
5675                 if (dpsoftrast.usethreads)
5676                 {
5677                         thread->waitcond = Thread_CreateCond();
5678                         thread->drawcond = Thread_CreateCond();
5679                         thread->drawmutex = Thread_CreateMutex();
5680                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5681                 }
5682         }
5683         return 0;
5684 }
5685
5686 void DPSOFTRAST_Shutdown(void)
5687 {
5688         int i;
5689         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5690         {
5691                 DPSOFTRAST_State_Thread *thread;
5692                 for (i = 0; i < dpsoftrast.numthreads; i++)
5693                 {
5694                         thread = &dpsoftrast.threads[i];
5695                         Thread_LockMutex(thread->drawmutex);
5696                         thread->index = -1;
5697                         Thread_CondSignal(thread->drawcond);
5698                         Thread_UnlockMutex(thread->drawmutex);
5699                         Thread_WaitThread(thread->thread, 0);
5700                         Thread_DestroyCond(thread->waitcond);
5701                         Thread_DestroyCond(thread->drawcond);
5702                         Thread_DestroyMutex(thread->drawmutex);
5703                 }
5704         }
5705         for (i = 0;i < dpsoftrast.texture_end;i++)
5706                 if (dpsoftrast.texture[i].bytes)
5707                         MM_FREE(dpsoftrast.texture[i].bytes);
5708         if (dpsoftrast.texture)
5709                 free(dpsoftrast.texture);
5710         if (dpsoftrast.threads)
5711                 MM_FREE(dpsoftrast.threads);
5712         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5713 }
5714