/*
 * de.git.xonotic.org Git - xonotic/darkplaces.git / blob - dpsoftrast.c
 * commit subject: "fix clang warnings for unused result of *_LockThreadMutex"
 * path: [xonotic/darkplaces.git] / dpsoftrast.c
 */
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
17 #define ALIGN_SIZE 16
18 #define ATOMIC_SIZE 4
19
// Platform-specific alignment and atomic-counter helpers.
// When SSE_POSSIBLE is defined, one branch per supported platform/compiler
// defines ALIGN, ATOMIC, MEMORY_BARRIER, ATOMIC_COUNTER and the
// ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD operations.
// Every branch implements MEMORY_BARRIER with _mm_sfence().
#ifdef SSE_POSSIBLE
	#if defined(__APPLE__)
		#include <libkern/OSAtomic.h>
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(4)))
		#define MEMORY_BARRIER (_mm_sfence())
		#define ATOMIC_COUNTER volatile int32_t 
		#define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
		#define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
	#elif defined(__GNUC__) && defined(WIN32)
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(4)))
		#define MEMORY_BARRIER (_mm_sfence())
		//(__sync_synchronize())
		#define ATOMIC_COUNTER volatile LONG
		// this LONG * cast serves to fix an issue with broken mingw
		// packages on Ubuntu; these only declare the function to take
		// a LONG *, causing a compile error here. This seems to be
		// error- and warn-free on platforms that DO declare
		// InterlockedIncrement correctly, like mingw on Windows.
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
	#elif defined(__GNUC__)
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(4)))
		#define MEMORY_BARRIER (_mm_sfence())
		//(__sync_synchronize())
		#define ATOMIC_COUNTER volatile int
		#define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
		#define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
		#define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
	#elif defined(_MSC_VER)
		#define ALIGN(var) __declspec(align(16)) var
		#define ATOMIC(var) __declspec(align(4)) var
		#define MEMORY_BARRIER (_mm_sfence())
		//(MemoryBarrier())
		#define ATOMIC_COUNTER volatile LONG
		#define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
		#define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
		#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
	#endif
#endif

// Fallbacks for every helper left undefined above (SSE_POSSIBLE not defined,
// or no platform branch matched): no alignment attribute, a no-op barrier,
// and plain integer operations in place of the atomic ones.
#ifndef ALIGN
#define ALIGN(var) var
#endif
#ifndef ATOMIC
#define ATOMIC(var) var
#endif
#ifndef MEMORY_BARRIER
#define MEMORY_BARRIER ((void)0)
#endif
#ifndef ATOMIC_COUNTER
#define ATOMIC_COUNTER int
#endif
#ifndef ATOMIC_INCREMENT
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#ifndef ATOMIC_DECREMENT
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#ifndef ATOMIC_ADD
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
86
87 #ifdef SSE_POSSIBLE
88 #include <emmintrin.h>
89
// Substitute for _mm_cvtss_f32 (extracts element 0 of the vector) that is
// meant to apply only to GCC releases older than 4.6; clang is excluded.
// NOTE(review): presumably those GCC versions lack the intrinsic - confirm.
// The major version must be spelled __GNUC__ (an undefined __GNUC evaluates
// to 0 and made this test always true), and the minor version is only
// meaningful when the major version is exactly 4.
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
	#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif
93
94 #define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)
95
96 static void *MM_CALLOC(size_t nmemb, size_t size)
97 {
98         void *ptr = _mm_malloc(nmemb*size, ALIGN_SIZE);
99         if (ptr != NULL) memset(ptr, 0, nmemb*size);
100         return ptr;
101 }
102
103 #define MM_FREE _mm_free
104 #else
105 #define MM_MALLOC(size) malloc(size)
106 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
107 #define MM_FREE free
108 #endif
109
// Indices of the vertex attribute arrays.
// DPSOFTRAST_ARRAY_TOTAL is the number of arrays; it sizes
// DPSOFTRAST_State_Triangle::attribs and DPSOFTRAST_State::post_array4f.
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION,
	DPSOFTRAST_ARRAY_COLOR,
	DPSOFTRAST_ARRAY_TEXCOORD0,
	DPSOFTRAST_ARRAY_TEXCOORD1,
	DPSOFTRAST_ARRAY_TEXCOORD2,
	DPSOFTRAST_ARRAY_TEXCOORD3,
	DPSOFTRAST_ARRAY_TEXCOORD4,
	DPSOFTRAST_ARRAY_TEXCOORD5,
	DPSOFTRAST_ARRAY_TEXCOORD6,
	DPSOFTRAST_ARRAY_TEXCOORD7,
	DPSOFTRAST_ARRAY_TOTAL
}
DPSOFTRAST_ARRAY;
125
// One slot of the texture array (dpsoftrast.texture).
// A slot whose bytes pointer is NULL is treated as unused by
// DPSOFTRAST_Texture_GetByIndex and DPSOFTRAST_Texture_New.
typedef struct DPSOFTRAST_Texture_s
{
	int flags; // flags given to DPSOFTRAST_Texture_New
	int width;
	int height;
	int depth;
	int sides; // 6 when DPSOFTRAST_TEXTURE_FLAG_CUBEMAP is set, otherwise 1
	DPSOFTRAST_TEXTURE_FILTER filter;
	int mipmaps; // number of entries used in mipmap[]
	int size; // total size in bytes of all mip levels
	ATOMIC_COUNTER binds; // when non-zero, DPSOFTRAST_Texture_Free flushes before freeing
	unsigned char *bytes; // pixel storage for all mip levels (allocated with MM_CALLOC)
	int mipmap[DPSOFTRAST_MAXMIPMAPS][5]; // per level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
}
DPSOFTRAST_Texture;
141
// Commands use the same alignment as ALIGN (ALIGN_SIZE).
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Common header shared by every command structure: each structure generated
// by DEFCOMMAND starts with these same two fields.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode;
	unsigned short commandsize;
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// Declares a command type: defines the constant DPSOFTRAST_OPCODE_<name> with
// value opcodeval and the aligned struct DPSOFTRAST_Command_<name>, which
// consists of the common header followed by the given fields.
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
162
// size in bytes of the command pool buffer
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// Byte buffer holding queued commands.
// NOTE(review): freecommand/usedcommands look like a write offset and a
// fill count for the buffer - confirm against the code that uses them.
typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
{
	int freecommand;
	int usedcommands;
	ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
173
// Per-triangle interpolation data.
// For each attribute array, attribs[array][0] is the per-x slope,
// attribs[array][1] the per-y slope and attribs[array][2] the base value,
// as consumed by DPSOFTRAST_CALCATTRIB / DPSOFTRAST_CALCATTRIB4F.
typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
{
	unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
	float w[3];
	ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
181
// Evaluates attribute array "arrayindex" of a triangle at a span (SSE form):
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// slope and data are __m128 lvalues; the attribs rows are read with _mm_load_ps.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar form of DPSOFTRAST_CALCATTRIB: slope and data are float[4] arrays.
// Every macro argument is parenthesized so that pointer expressions such as
// &span or base+offset expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
198                                         
199 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
200
// One horizontal run of pixels produced while rasterizing a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	int triangle; // triangle this span was generated by
	int x; // framebuffer x coord
	int y; // framebuffer y coord
	int startx; // usable range (according to pixelmask)
	int endx; // usable range (according to pixelmask)
	unsigned char *pixelmask; // true for pixels that passed depth test, false for others
	int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
	int depthslope; // depthbuffer value pixel delta
}
DPSOFTRAST_State_Span);
213
214 #define DPSOFTRAST_DRAW_MAXSPANS 1024
215 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
216 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
217
218 #define DPSOFTRAST_VALIDATE_FB 1
219 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
220 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
221 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
222
// Blend modes stored in DPSOFTRAST_State_Thread::fb_blendmode.
// DPSOFTRAST_RecalcBlendFunc picks one from the blendfunc factor pair and
// falls back to DPSOFTRAST_BLENDMODE_OPAQUE for unrecognized pairs.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE,
	DPSOFTRAST_BLENDMODE_ALPHA,
	DPSOFTRAST_BLENDMODE_ADDALPHA,
	DPSOFTRAST_BLENDMODE_ADD,
	DPSOFTRAST_BLENDMODE_INVMOD,
	DPSOFTRAST_BLENDMODE_MUL,
	DPSOFTRAST_BLENDMODE_MUL2,
	DPSOFTRAST_BLENDMODE_SUBALPHA,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
	DPSOFTRAST_BLENDMODE_INVADD,
	DPSOFTRAST_BLENDMODE_TOTAL
}
DPSOFTRAST_BLENDMODE;
238
// State owned by one rasterizer thread (one element of dpsoftrast.threads).
// It carries its own render-state fields (several of which also exist in
// DPSOFTRAST_State), the values derived from them by the DPSOFTRAST_Recalc*
// functions, and the span/triangle work buffers.
typedef ALIGN(struct DPSOFTRAST_State_Thread_s
{
	void *thread;
	int index; // position of this thread, used by DPSOFTRAST_RecalcThread to pick its rows
	
	int cullface;
	int colormask[4];
	int blendfunc[2]; // source and destination factor, see DPSOFTRAST_RecalcBlendFunc
	int blendsubtract;
	int depthmask;
	int depthtest;
	int depthfunc;
	int scissortest;
	int viewport[4];
	int scissor[4];
	float depthrange[2];
	float polygonoffset[2];
	float clipplane[4];
	ALIGN(float fb_clipplane[4]); // derived from clipplane by DPSOFTRAST_RecalcClipPlane

	int shader_mode;
	int shader_permutation;
	int shader_exactspecularmath;

	DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // rebased by DPSOFTRAST_Texture_Grow when the texture array moves
	
	ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
	int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

	// DPSOFTRAST_VALIDATE_ flags
	// (a set bit means the matching derived values are stale; DPSOFTRAST_Validate clears it)
	int validate;

	// derived values (DPSOFTRAST_VALIDATE_FB)
	int fb_colormask;
	int fb_scissor[4]; // x, y, width, height after clamping to the framebuffer
	ALIGN(float fb_viewportcenter[4]);
	ALIGN(float fb_viewportscale[4]);

	// derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
	int fb_depthfunc;

	// derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
	int fb_blendmode;

	// band boundaries
	// (computed by DPSOFTRAST_RecalcThread; the two bands are identical unless dpsoftrast.interlace is set)
	int miny1;
	int maxy1;
	int miny2;
	int maxy2;

	ATOMIC(volatile int commandoffset);

	volatile bool waiting;
	volatile bool starving;
	void *waitcond;
	void *drawcond;
	void *drawmutex;

	int numspans;
	int numtriangles;
	DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
	DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
	unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
}
DPSOFTRAST_State_Thread);
304
// Global state of the software rasterizer; the single instance is the
// variable dpsoftrast defined right after the structure.
typedef ALIGN(struct DPSOFTRAST_State_s
{
	int fb_width;
	int fb_height;
	unsigned int *fb_depthpixels;
	unsigned int *fb_colorpixels[4];

	int viewport[4];
	ALIGN(float fb_viewportcenter[4]);
	ALIGN(float fb_viewportscale[4]);

	float color[4];
	ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
	int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

	const float *pointer_vertex3f;
	const float *pointer_color4f;
	const unsigned char *pointer_color4ub;
	const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
	int stride_vertex;
	int stride_color;
	int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
	int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
	DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // rebased by DPSOFTRAST_Texture_Grow when the texture array moves

	int firstvertex;
	int numvertices;
	float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
	float *screencoord4f;
	int drawstarty;
	int drawendy;
	int drawclipped;
	
	int shader_mode;
	int shader_permutation;
	int shader_exactspecularmath;

	int texture_max; // number of slots allocated in texture[]
	int texture_end; // slots at or beyond this index are never considered in use
	int texture_firstfree; // index where DPSOFTRAST_Texture_New starts looking for a free slot
	DPSOFTRAST_Texture *texture; // texture slot array, enlarged by DPSOFTRAST_Texture_Grow

	int bigendian;

	// error reporting
	// (points at a string literal describing the most recent failure, e.g. from DPSOFTRAST_Texture_New)
	const char *errorstring;

	bool usethreads;
	int interlace; // when non-zero each thread gets two row bands, see DPSOFTRAST_RecalcThread
	int numthreads;
	DPSOFTRAST_State_Thread *threads;

	ATOMIC(volatile int drawcommand);

	DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

DPSOFTRAST_State dpsoftrast;
364
// scale factor between float depth and integer depth values (2^30)
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float color channels into one int laid out as a<<24 | r<<16 | g<<8 | b,
// scaling each channel by 255 and rounding to nearest.
// Arguments are parenthesized so that expressions such as x+y convert correctly, and
// the shifts are done on unsigned values because shifting a signed 255 left by 24
// overflows int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth representation: DPSOFTRAST_DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
369
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
372
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
374 {
375         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377         fb_viewportcenter[3] = 0.5f;
378         fb_viewportcenter[0] = 0.0f;
379         fb_viewportscale[1] = 0.5f * viewport[2];
380         fb_viewportscale[2] = -0.5f * viewport[3];
381         fb_viewportscale[3] = 0.5f;
382         fb_viewportscale[0] = 1.0f;
383 }
384
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
386 {
387         if (dpsoftrast.interlace)
388         {
389                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
393         }
394         else
395         {
396                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
398         }
399 }
400
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
402 {
403         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
408 }
409
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
411 {
412         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413         // and viewport projection values
414         int x1, x2;
415         int y1, y2;
416         x1 = thread->scissor[0];
417         x2 = thread->scissor[0] + thread->scissor[2];
418         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419         y2 = dpsoftrast.fb_height - thread->scissor[1];
420         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
421         if (x1 < 0) x1 = 0;
422         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
423         if (y1 < 0) y1 = 0;
424         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
425         thread->fb_scissor[0] = x1;
426         thread->fb_scissor[1] = y1;
427         thread->fb_scissor[2] = x2 - x1;
428         thread->fb_scissor[3] = y2 - y1;
429
430         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431         DPSOFTRAST_RecalcClipPlane(thread);
432         DPSOFTRAST_RecalcThread(thread);
433 }
434
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
436 {
437         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
438 }
439
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
441 {
442         if (thread->blendsubtract)
443         {
444                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
445                 {
446                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
450                 }
451         }
452         else
453         {       
454                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
455                 {
456                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
461                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
467                 }
468         }
469 }
470
// Calls DPSOFTRAST_Validate(thread, f) only when at least one flag in f is
// still set in thread->validate; the expression always evaluates to 0.
// Both arguments are parenthesized so pointer expressions such as &state
// expand correctly. Note that thread is evaluated more than once.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
472
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
474 {
475         mask &= thread->validate;
476         if (!mask)
477                 return;
478         if (mask & DPSOFTRAST_VALIDATE_FB)
479         {
480                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481                 DPSOFTRAST_RecalcFB(thread);
482         }
483         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
484         {
485                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486                 DPSOFTRAST_RecalcDepthFunc(thread);
487         }
488         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
489         {
490                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491                 DPSOFTRAST_RecalcBlendFunc(thread);
492         }
493 }
494
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
496 {
497         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498                 return &dpsoftrast.texture[index];
499         return NULL;
500 }
501
502 static void DPSOFTRAST_Texture_Grow(void)
503 {
504         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505         DPSOFTRAST_State_Thread *thread;
506         int i;
507         int j;
508         DPSOFTRAST_Flush();
509         // expand texture array as needed
510         if (dpsoftrast.texture_max < 1024)
511                 dpsoftrast.texture_max = 1024;
512         else
513                 dpsoftrast.texture_max *= 2;
514         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
515         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516                 if (dpsoftrast.texbound[i])
517                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
518         for (j = 0; j < dpsoftrast.numthreads; j++)
519         {
520                 thread = &dpsoftrast.threads[j];
521                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522                         if (thread->texbound[i])
523                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
524         }
525 }
526
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
528 {
529         int w;
530         int h;
531         int d;
532         int size;
533         int s;
534         int texnum;
535         int mipmaps;
536         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538         DPSOFTRAST_Texture *texture;
539         if (width*height*depth < 1)
540         {
541                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
542                 return 0;
543         }
544         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
545         {
546                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
547                 return 0;
548         }
549         switch(texformat)
550         {
551         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
554                 break;
555         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
557                 {
558                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
559                         return 0;
560                 }
561                 if (depth != 1)
562                 {
563                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
564                         return 0;
565                 }
566                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
567                 {
568                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
569                         return 0;
570                 }
571                 break;
572         }
573         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
574         {
575                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
576                 return 0;
577         }
578         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
579         {
580                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
581                 return 0;
582         }
583         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
584         {
585                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
586                 return 0;
587         }
588         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
589         {
590                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
591                 return 0;
592         }
593         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
594         {
595                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
596                 return 0;
597         }
598         // find first empty slot in texture array
599         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600                 if (!dpsoftrast.texture[texnum].bytes)
601                         break;
602         dpsoftrast.texture_firstfree = texnum + 1;
603         if (dpsoftrast.texture_max <= texnum)
604                 DPSOFTRAST_Texture_Grow();
605         if (dpsoftrast.texture_end <= texnum)
606                 dpsoftrast.texture_end = texnum + 1;
607         texture = &dpsoftrast.texture[texnum];
608         memset(texture, 0, sizeof(*texture));
609         texture->flags = flags;
610         texture->width = width;
611         texture->height = height;
612         texture->depth = depth;
613         texture->sides = sides;
614         texture->binds = 0;
615         w = width;
616         h = height;
617         d = depth;
618         size = 0;
619         mipmaps = 0;
620         w = width;
621         h = height;
622         d = depth;
623         for (;;)
624         {
625                 s = w * h * d * sides * 4;
626                 texture->mipmap[mipmaps][0] = size;
627                 texture->mipmap[mipmaps][1] = s;
628                 texture->mipmap[mipmaps][2] = w;
629                 texture->mipmap[mipmaps][3] = h;
630                 texture->mipmap[mipmaps][4] = d;
631                 size += s;
632                 mipmaps++;
633                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
634                         break;
635                 if (w > 1) w >>= 1;
636                 if (h > 1) h >>= 1;
637                 if (d > 1) d >>= 1;
638         }
639         texture->mipmaps = mipmaps;
640         texture->size = size;
641
642         // allocate the pixels now
643         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
644
645         return texnum;
646 }
647 void DPSOFTRAST_Texture_Free(int index)
648 {
649         DPSOFTRAST_Texture *texture;
650         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
651         if (texture->binds)
652                 DPSOFTRAST_Flush();
653         if (texture->bytes)
654                 MM_FREE(texture->bytes);
655         texture->bytes = NULL;
656         memset(texture, 0, sizeof(*texture));
657         // adjust the free range and used range
658         if (dpsoftrast.texture_firstfree > index)
659                 dpsoftrast.texture_firstfree = index;
660         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
661                 dpsoftrast.texture_end--;
662 }
663 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
664 {
665         int i, x, y, z, w, layer0, layer1, row0, row1;
666         unsigned char *o, *i0, *i1, *i2, *i3;
667         DPSOFTRAST_Texture *texture;
668         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
669         if (texture->mipmaps <= 1)
670                 return;
671         for (i = 1;i < texture->mipmaps;i++)
672         {
673                 for (z = 0;z < texture->mipmap[i][4];z++)
674                 {
675                         layer0 = z*2;
676                         layer1 = z*2+1;
677                         if (layer1 >= texture->mipmap[i-1][4])
678                                 layer1 = texture->mipmap[i-1][4]-1;
679                         for (y = 0;y < texture->mipmap[i][3];y++)
680                         {
681                                 row0 = y*2;
682                                 row1 = y*2+1;
683                                 if (row1 >= texture->mipmap[i-1][3])
684                                         row1 = texture->mipmap[i-1][3]-1;
685                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
686                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
687                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
688                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
689                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
690                                 w = texture->mipmap[i][2];
691                                 if (layer1 > layer0)
692                                 {
693                                         if (texture->mipmap[i-1][2] > 1)
694                                         {
695                                                 // average 3D texture
696                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
697                                                 {
698                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
699                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
700                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
701                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
702                                                 }
703                                         }
704                                         else
705                                         {
706                                                 // average 3D mipmap with parent width == 1
707                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
708                                                 {
709                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
710                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
711                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
712                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
713                                                 }
714                                         }
715                                 }
716                                 else
717                                 {
718                                         if (texture->mipmap[i-1][2] > 1)
719                                         {
720                                                 // average 2D texture (common case)
721                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
722                                                 {
723                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
724                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
725                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
726                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
727                                                 }
728                                         }
729                                         else
730                                         {
731                                                 // 2D texture with parent width == 1
732                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
733                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
734                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
735                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
736                                         }
737                                 }
738                         }
739                 }
740         }
741 }
742 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
743 {
744         DPSOFTRAST_Texture *texture;
745         unsigned char *dst;
746         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
747         if (texture->binds)
748                 DPSOFTRAST_Flush();
749         if (pixels)
750         {
751                 dst = texture->bytes + texture->mipmap[0][1] +(-blocky * texture->mipmap[0][2] + blockx) * 4;
752                 while (blockheight > 0)
753                 {
754                         dst -= texture->mipmap[0][2] * 4;
755                         memcpy(dst, pixels, blockwidth * 4);
756                         pixels += blockwidth * 4;
757                         blockheight--;
758                 }
759         }
760         DPSOFTRAST_Texture_CalculateMipmaps(index);
761 }
762 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
763 {
764         DPSOFTRAST_Texture *texture;
765         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
766         if (texture->binds)
767                 DPSOFTRAST_Flush();
768         if (pixels)
769         {
770                 int i, stride = texture->mipmap[0][2]*4;
771                 unsigned char *dst = texture->bytes + texture->mipmap[0][1];
772                 for (i = texture->mipmap[0][3];i > 0;i--)
773                 {
774                         dst -= stride;
775                         memcpy(dst, pixels, stride);
776                         pixels += stride;
777                 }
778         }
779         DPSOFTRAST_Texture_CalculateMipmaps(index);
780 }
781 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
782 {
783         DPSOFTRAST_Texture *texture;
784         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
785         return texture->mipmap[mip][2];
786 }
787 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
788 {
789         DPSOFTRAST_Texture *texture;
790         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
791         return texture->mipmap[mip][3];
792 }
793 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
794 {
795         DPSOFTRAST_Texture *texture;
796         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
797         return texture->mipmap[mip][4];
798 }
799 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
800 {
801         DPSOFTRAST_Texture *texture;
802         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
803         if (texture->binds)
804                 DPSOFTRAST_Flush();
805         return texture->bytes + texture->mipmap[mip][0];
806 }
807 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
808 {
809         DPSOFTRAST_Texture *texture;
810         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
811         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
812         {
813                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
814                 return;
815         }
816         if (texture->binds)
817                 DPSOFTRAST_Flush();
818         texture->filter = filter;
819 }
820
821 static void DPSOFTRAST_Draw_FlushThreads(void);
822
823 static void DPSOFTRAST_Draw_SyncCommands(void)
824 {
825         if(dpsoftrast.usethreads) MEMORY_BARRIER;
826         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
827 }
828
829 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
830 {
831         DPSOFTRAST_State_Thread *thread;
832         int i;
833         int freecommand = dpsoftrast.commandpool.freecommand;
834         int usedcommands = dpsoftrast.commandpool.usedcommands;
835         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
836                 return;
837         DPSOFTRAST_Draw_SyncCommands();
838         for(;;)
839         {
840                 int waitindex = -1;
841                 int commandoffset;
842                 usedcommands = 0;
843                 for (i = 0; i < dpsoftrast.numthreads; i++)
844                 {
845                         thread = &dpsoftrast.threads[i]; 
846                         commandoffset = freecommand - thread->commandoffset;
847                         if (commandoffset < 0)
848                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
849                         if (commandoffset > usedcommands)
850                         {
851                                 waitindex = i;
852                                 usedcommands = commandoffset;
853                         }
854                 }
855                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
856                         break;
857                 thread = &dpsoftrast.threads[waitindex];
858                 Thread_LockMutex(thread->drawmutex);
859                 if (thread->commandoffset != dpsoftrast.drawcommand)
860                 {
861                         thread->waiting = true;
862                         if (thread->starving) Thread_CondSignal(thread->drawcond);
863                         Thread_CondWait(thread->waitcond, thread->drawmutex);
864                         thread->waiting = false;
865                 }
866                 Thread_UnlockMutex(thread->drawmutex);
867         }
868         dpsoftrast.commandpool.usedcommands = usedcommands;
869 }
870
// Pads a byte count with the distance to the next COMMAND_SIZE boundary
// (the masking presumes COMMAND_SIZE is a power of two).
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	((size) + ((COMMAND_SIZE - ((size) & (COMMAND_SIZE - 1))) & (COMMAND_SIZE - 1)))
// Reserves pool space for one DPSOFTRAST_Command_<name>, tagged with
// DPSOFTRAST_OPCODE_<name>, and yields a pointer of the matching command type.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name, \
		DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_##name))))
875
876 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
877 {
878         DPSOFTRAST_Command *command;
879         int freecommand = dpsoftrast.commandpool.freecommand;
880         int usedcommands = dpsoftrast.commandpool.usedcommands;
881         int extra = sizeof(DPSOFTRAST_Command);
882         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
883                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
884         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
885         {
886                 if (dpsoftrast.usethreads)
887                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
888                 else
889                         DPSOFTRAST_Draw_FlushThreads();
890                 freecommand = dpsoftrast.commandpool.freecommand;
891                 usedcommands = dpsoftrast.commandpool.usedcommands;
892         }
893         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
894         {
895                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
896                 command->opcode = DPSOFTRAST_OPCODE_Reset;
897                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
898                 freecommand = 0;
899         }
900         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
901         command->opcode = opcode;
902         command->commandsize = size;
903         freecommand += size;
904         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
905                 freecommand = 0;
906         dpsoftrast.commandpool.freecommand = freecommand;
907         dpsoftrast.commandpool.usedcommands = usedcommands + size;
908         return command;
909 }
910
911 static void DPSOFTRAST_UndoCommand(int size)
912 {
913         int freecommand = dpsoftrast.commandpool.freecommand;
914         int usedcommands = dpsoftrast.commandpool.usedcommands;
915         freecommand -= size;
916         if (freecommand < 0)
917                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
918         usedcommands -= size;
919         dpsoftrast.commandpool.freecommand = freecommand;
920         dpsoftrast.commandpool.usedcommands = usedcommands;
921 }
922                 
923 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
924 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
925 {
926         thread->viewport[0] = command->x;
927         thread->viewport[1] = command->y;
928         thread->viewport[2] = command->width;
929         thread->viewport[3] = command->height;
930         thread->validate |= DPSOFTRAST_VALIDATE_FB;
931 }
932 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
933 {
934         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
935         command->x = x;
936         command->y = y;
937         command->width = width;
938         command->height = height;
939
940         dpsoftrast.viewport[0] = x;
941         dpsoftrast.viewport[1] = y;
942         dpsoftrast.viewport[2] = width;
943         dpsoftrast.viewport[3] = height;
944         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
945 }
946
947 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
948 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
949 {
950         int i, x1, y1, x2, y2, w, h, x, y;
951         int miny1, maxy1, miny2, maxy2;
952         int bandy;
953         unsigned int *p;
954         unsigned int c;
955         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
956         miny1 = thread->miny1;
957         maxy1 = thread->maxy1;
958         miny2 = thread->miny2;
959         maxy2 = thread->maxy2;
960         x1 = thread->fb_scissor[0];
961         y1 = thread->fb_scissor[1];
962         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
963         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
964         if (y1 < miny1) y1 = miny1;
965         if (y2 > maxy2) y2 = maxy2;
966         w = x2 - x1;
967         h = y2 - y1;
968         if (w < 1 || h < 1)
969                 return;
970         // FIXME: honor fb_colormask?
971         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
972         for (i = 0;i < 4;i++)
973         {
974                 if (!dpsoftrast.fb_colorpixels[i])
975                         continue;
976                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
977                 for (;y < bandy;y++)
978                 {
979                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
980                         for (x = x1;x < x2;x++)
981                                 p[x] = c;
982                 }
983         }
984 }
985 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
986 {
987         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
988         command->r = r;
989         command->g = g;
990         command->b = b;
991         command->a = a;
992 }
993
994 DEFCOMMAND(3, ClearDepth, float depth;)
995 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
996 {
997         int x1, y1, x2, y2, w, h, x, y;
998         int miny1, maxy1, miny2, maxy2;
999         int bandy;
1000         unsigned int *p;
1001         unsigned int c;
1002         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
1003         miny1 = thread->miny1;
1004         maxy1 = thread->maxy1;
1005         miny2 = thread->miny2;
1006         maxy2 = thread->maxy2;
1007         x1 = thread->fb_scissor[0];
1008         y1 = thread->fb_scissor[1];
1009         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1010         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
1011         if (y1 < miny1) y1 = miny1;
1012         if (y2 > maxy2) y2 = maxy2;
1013         w = x2 - x1;
1014         h = y2 - y1;
1015         if (w < 1 || h < 1)
1016                 return;
1017         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
1018         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1019         for (;y < bandy;y++)
1020         {
1021                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1022                 for (x = x1;x < x2;x++)
1023                         p[x] = c;
1024         }
1025 }
1026 void DPSOFTRAST_ClearDepth(float d)
1027 {
1028         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1029         command->depth = d;
1030 }
1031
1032 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1033 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1034 {
1035         thread->colormask[0] = command->r != 0;
1036         thread->colormask[1] = command->g != 0;
1037         thread->colormask[2] = command->b != 0;
1038         thread->colormask[3] = command->a != 0;
1039         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1040 }
1041 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1042 {
1043         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1044         command->r = r;
1045         command->g = g;
1046         command->b = b;
1047         command->a = a;
1048 }
1049
1050 DEFCOMMAND(5, DepthTest, int enable;)
1051 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1052 {
1053         thread->depthtest = command->enable;
1054         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1055 }
1056 void DPSOFTRAST_DepthTest(int enable)
1057 {
1058         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1059         command->enable = enable;
1060 }
1061
1062 DEFCOMMAND(6, ScissorTest, int enable;)
1063 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1064 {
1065         thread->scissortest = command->enable;
1066         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1067 }
1068 void DPSOFTRAST_ScissorTest(int enable)
1069 {
1070         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1071         command->enable = enable;
1072 }
1073
1074 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1075 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1076 {
1077         thread->scissor[0] = command->x;
1078         thread->scissor[1] = command->y;
1079         thread->scissor[2] = command->width;
1080         thread->scissor[3] = command->height;
1081         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1082 }
1083 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1084 {
1085         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1086         command->x = x;
1087         command->y = y;
1088         command->width = width;
1089         command->height = height;
1090 }
1091
1092 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1093 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1094 {
1095         thread->blendfunc[0] = command->sfactor;
1096         thread->blendfunc[1] = command->dfactor;
1097         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1098 }
1099 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1100 {
1101         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1102         command->sfactor = sfactor;
1103         command->dfactor = dfactor;
1104 }
1105
1106 DEFCOMMAND(9, BlendSubtract, int enable;)
1107 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1108 {
1109         thread->blendsubtract = command->enable;
1110         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1111 }
1112 void DPSOFTRAST_BlendSubtract(int enable)
1113 {
1114         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1115         command->enable = enable;
1116 }
1117
1118 DEFCOMMAND(10, DepthMask, int enable;)
1119 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1120 {
1121         thread->depthmask = command->enable;
1122 }
1123 void DPSOFTRAST_DepthMask(int enable)
1124 {
1125         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1126         command->enable = enable;
1127 }
1128
1129 DEFCOMMAND(11, DepthFunc, int func;)
1130 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1131 {
1132         thread->depthfunc = command->func;
1133 }
1134 void DPSOFTRAST_DepthFunc(int func)
1135 {
1136         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1137         command->func = func;
1138 }
1139
1140 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1141 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1142 {
1143         thread->depthrange[0] = command->nearval;
1144         thread->depthrange[1] = command->farval;
1145 }
1146 void DPSOFTRAST_DepthRange(float nearval, float farval)
1147 {
1148         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1149         command->nearval = nearval;
1150         command->farval = farval;
1151 }
1152
1153 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1154 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1155 {
1156         thread->polygonoffset[0] = command->alongnormal;
1157         thread->polygonoffset[1] = command->intoview;
1158 }
1159 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1160 {
1161         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1162         command->alongnormal = alongnormal;
1163         command->intoview = intoview;
1164 }
1165
1166 DEFCOMMAND(14, CullFace, int mode;)
1167 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1168 {
1169         thread->cullface = command->mode;
1170 }
1171 void DPSOFTRAST_CullFace(int mode)
1172 {
1173         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1174         command->mode = mode;
1175 }
1176
1177 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1178 {
1179         dpsoftrast.color[0] = r;
1180         dpsoftrast.color[1] = g;
1181         dpsoftrast.color[2] = b;
1182         dpsoftrast.color[3] = a;
1183 }
1184
1185 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1186 {
1187         int outstride = blockwidth * 4;
1188         int instride = dpsoftrast.fb_width * 4;
1189         int bx1 = blockx;
1190         int by1 = blocky;
1191         int bx2 = blockx + blockwidth;
1192         int by2 = blocky + blockheight;
1193         int bw;
1194         int x;
1195         int y;
1196         unsigned char *inpixels;
1197         unsigned char *b;
1198         unsigned char *o;
1199         DPSOFTRAST_Flush();
1200         if (bx1 < 0) bx1 = 0;
1201         if (by1 < 0) by1 = 0;
1202         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1203         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1204         bw = bx2 - bx1;
1205         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1206         if (dpsoftrast.bigendian)
1207         {
1208                 for (y = by1;y < by2;y++)
1209                 {
1210                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1212                         for (x = bx1;x < bx2;x++)
1213                         {
1214                                 o[0] = b[3];
1215                                 o[1] = b[2];
1216                                 o[2] = b[1];
1217                                 o[3] = b[0];
1218                                 o += 4;
1219                                 b += 4;
1220                         }
1221                 }
1222         }
1223         else
1224         {
1225                 for (y = by1;y < by2;y++)
1226                 {
1227                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1228                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1229                         memcpy(o, b, bw*4);
1230                 }
1231         }
1232
1233 }
1234 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1235 {
1236         int tx1 = tx;
1237         int ty1 = ty;
1238         int tx2 = tx + width;
1239         int ty2 = ty + height;
1240         int sx1 = sx;
1241         int sy1 = sy;
1242         int sx2 = sx + width;
1243         int sy2 = sy + height;
1244         int swidth;
1245         int sheight;
1246         int twidth;
1247         int theight;
1248         int sw;
1249         int sh;
1250         int tw;
1251         int th;
1252         int y;
1253         unsigned int *spixels;
1254         unsigned int *tpixels;
1255         DPSOFTRAST_Texture *texture;
1256         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1257         if (mip < 0 || mip >= texture->mipmaps) return;
1258         DPSOFTRAST_Flush();
1259         spixels = dpsoftrast.fb_colorpixels[0];
1260         swidth = dpsoftrast.fb_width;
1261         sheight = dpsoftrast.fb_height;
1262         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1263         twidth = texture->mipmap[mip][2];
1264         theight = texture->mipmap[mip][3];
1265         if (tx1 < 0) tx1 = 0;
1266         if (ty1 < 0) ty1 = 0;
1267         if (tx2 > twidth) tx2 = twidth;
1268         if (ty2 > theight) ty2 = theight;
1269         if (sx1 < 0) sx1 = 0;
1270         if (sy1 < 0) sy1 = 0;
1271         if (sx2 > swidth) sx2 = swidth;
1272         if (sy2 > sheight) sy2 = sheight;
1273         tw = tx2 - tx1;
1274         th = ty2 - ty1;
1275         sw = sx2 - sx1;
1276         sh = sy2 - sy1;
1277         if (tw > sw) tw = sw;
1278         if (th > sh) th = sh;
1279         if (tw < 1 || th < 1)
1280                 return;
1281         sy1 = sheight - sy1 - th;
1282         ty1 = theight - ty1 - th;
1283         for (y = 0;y < th;y++)
1284                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1285         if (texture->mipmaps > 1)
1286                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1287 }
1288
1289 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1290 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1291 {
1292         if (thread->texbound[command->unitnum])
1293                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1294         thread->texbound[command->unitnum] = command->texture;
1295 }
1296 void DPSOFTRAST_SetTexture(int unitnum, int index)
1297 {
1298         DPSOFTRAST_Command_SetTexture *command;
1299         DPSOFTRAST_Texture *texture;
1300         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1301         {
1302                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1303                 return;
1304         }
1305         texture = DPSOFTRAST_Texture_GetByIndex(index);
1306         if (index && !texture)
1307         {
1308                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1309                 return;
1310         }
1311
1312         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1313         command->unitnum = unitnum;
1314         command->texture = texture;
1315
1316         dpsoftrast.texbound[unitnum] = texture;
1317         if (texture)
1318                 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1319 }
1320
1321 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1322 {
1323         dpsoftrast.pointer_vertex3f = vertex3f;
1324         dpsoftrast.stride_vertex = stride;
1325 }
1326 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1327 {
1328         dpsoftrast.pointer_color4f = color4f;
1329         dpsoftrast.pointer_color4ub = NULL;
1330         dpsoftrast.stride_color = stride;
1331 }
1332 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1333 {
1334         dpsoftrast.pointer_color4f = NULL;
1335         dpsoftrast.pointer_color4ub = color4ub;
1336         dpsoftrast.stride_color = stride;
1337 }
1338 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1339 {
1340         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1341         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1342         dpsoftrast.stride_texcoord[unitnum] = stride;
1343 }
1344
1345 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1346 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1347 {
1348         thread->shader_mode = command->mode;
1349         thread->shader_permutation = command->permutation;
1350         thread->shader_exactspecularmath = command->exactspecularmath;
1351 }
1352 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1353 {
1354         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1355         command->mode = mode;
1356         command->permutation = permutation;
1357         command->exactspecularmath = exactspecularmath;
1358
1359         dpsoftrast.shader_mode = mode;
1360         dpsoftrast.shader_permutation = permutation;
1361         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1362 }
1363
1364 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1365 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1366 {
1367         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1368 }
1369 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1370 {
1371         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1372         command->index = index;
1373         command->val[0] = v0;
1374         command->val[1] = v1;
1375         command->val[2] = v2;
1376         command->val[3] = v3;
1377
1378         dpsoftrast.uniform4f[index*4+0] = v0;
1379         dpsoftrast.uniform4f[index*4+1] = v1;
1380         dpsoftrast.uniform4f[index*4+2] = v2;
1381         dpsoftrast.uniform4f[index*4+3] = v3;
1382 }
1383 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1384 {
1385         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1386         command->index = index;
1387         memcpy(command->val, v, sizeof(command->val));
1388
1389         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1390 }
1391
1392 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1393 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1394 {
1395         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1396 }
1397 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1398 {
1399 #ifdef SSE_POSSIBLE
1400         int i, index;
1401         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1402         {
1403                 __m128 m0, m1, m2, m3;
1404                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1405                 command->index = (DPSOFTRAST_UNIFORM)index;
1406                 if (((size_t)v)&(ALIGN_SIZE-1))
1407                 {
1408                         m0 = _mm_loadu_ps(v);
1409                         m1 = _mm_loadu_ps(v+4);
1410                         m2 = _mm_loadu_ps(v+8);
1411                         m3 = _mm_loadu_ps(v+12);
1412                 }
1413                 else
1414                 {
1415                         m0 = _mm_load_ps(v);
1416                         m1 = _mm_load_ps(v+4);
1417                         m2 = _mm_load_ps(v+8);
1418                         m3 = _mm_load_ps(v+12);
1419                 }
1420                 if (transpose)
1421                 {
1422                         __m128 t0, t1, t2, t3;
1423                         t0 = _mm_unpacklo_ps(m0, m1);
1424                         t1 = _mm_unpacklo_ps(m2, m3);
1425                         t2 = _mm_unpackhi_ps(m0, m1);
1426                         t3 = _mm_unpackhi_ps(m2, m3);
1427                         m0 = _mm_movelh_ps(t0, t1);
1428                         m1 = _mm_movehl_ps(t1, t0);
1429                         m2 = _mm_movelh_ps(t2, t3);
1430                         m3 = _mm_movehl_ps(t3, t2);                     
1431                 }
1432                 _mm_store_ps(command->val, m0);
1433                 _mm_store_ps(command->val+4, m1);
1434                 _mm_store_ps(command->val+8, m2);
1435                 _mm_store_ps(command->val+12, m3);
1436                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1437                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1438                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1439                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1440         }
1441 #endif
1442 }
1443
1444 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1445 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1446 {
1447         thread->uniform1i[command->index] = command->val;
1448 }
1449 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1450 {
1451         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1452         command->index = index;
1453         command->val = i0;
1454
1455         dpsoftrast.uniform1i[command->index] = i0;
1456 }
1457
1458 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1459 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1460 {
1461         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1462         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1463 }
1464 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1465 {
1466         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1467         command->clipplane[0] = x;
1468         command->clipplane[1] = y;
1469         command->clipplane[2] = z;
1470         command->clipplane[3] = w;
1471 }
1472
1473 #ifdef SSE_POSSIBLE
1474 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1475 {
1476         float *end = dst + size*4;
1477         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1478         {
1479                 while (dst < end)
1480                 {
1481                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1482                         dst += 4;
1483                         src += stride;
1484                 }
1485         }
1486         else
1487         {
1488                 while (dst < end)
1489                 {
1490                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1491                         dst += 4;
1492                         src += stride;
1493                 }
1494         }
1495 }
1496
1497 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1498 {
1499         float *end = dst + size*4;
1500         if (stride == sizeof(float[3]))
1501         {
1502                 float *end4 = dst + (size&~3)*4;        
1503                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1504                 {
1505                         while (dst < end4)
1506                         {
1507                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1508                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1509                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1510                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1511                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1512                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1513                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1514                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1515                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1516                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1517                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1518                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1519                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1520                                 dst += 16;
1521                                 src += 4*sizeof(float[3]);
1522                         }
1523                 }
1524                 else
1525                 {
1526                         while (dst < end4)
1527                         {
1528                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1529                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1530                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1531                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1532                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1533                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1534                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1535                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1536                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1537                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1538                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1539                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1540                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1541                                 dst += 16;
1542                                 src += 4*sizeof(float[3]);
1543                         }
1544                 }
1545         }
1546         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1547         {
1548                 while (dst < end)
1549                 {
1550                         __m128 v = _mm_loadu_ps((const float *)src);
1551                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1552                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1553                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1554                         _mm_store_ps(dst, v);
1555                         dst += 4;
1556                         src += stride;
1557                 }
1558         }
1559         else
1560         {
1561                 while (dst < end)
1562                 {
1563                         __m128 v = _mm_load_ps((const float *)src);
1564                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1565                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1566                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1567                         _mm_store_ps(dst, v);
1568                         dst += 4;
1569                         src += stride;
1570                 }
1571         }
1572 }
1573
1574 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1575 {
1576         float *end = dst + size*4;
1577         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1578         if (stride == sizeof(float[2]))
1579         {
1580                 float *end2 = dst + (size&~1)*4;
1581                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1582                 {
1583                         while (dst < end2)
1584                         {
1585                                 __m128 v = _mm_loadu_ps((const float *)src);
1586                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1587                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1588                                 dst += 8;
1589                                 src += 2*sizeof(float[2]);
1590                         }
1591                 }
1592                 else
1593                 {
1594                         while (dst < end2)
1595                         {
1596                                 __m128 v = _mm_load_ps((const float *)src);
1597                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1598                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1599                                 dst += 8;
1600                                 src += 2*sizeof(float[2]);
1601                         }
1602                 }
1603         }
1604         while (dst < end)
1605         {
1606                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1607                 dst += 4;
1608                 src += stride;
1609         }
1610 }
1611
1612 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1613 {
1614         float *end = dst + size*4;
1615         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1616         if (stride == sizeof(unsigned char[4]))
1617         {
1618                 float *end4 = dst + (size&~3)*4;
1619                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1620                 {
1621                         while (dst < end4)
1622                         {
1623                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1624                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1625                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1626                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1627                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1628                                 dst += 16;
1629                                 src += 4*sizeof(unsigned char[4]);
1630                         }
1631                 }
1632                 else
1633                 {
1634                         while (dst < end4)
1635                         {
1636                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1637                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1638                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1639                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1640                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1641                                 dst += 16;
1642                                 src += 4*sizeof(unsigned char[4]);
1643                         }
1644                 }
1645         }
1646         while (dst < end)
1647         {
1648                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1649                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1650                 dst += 4;
1651                 src += stride;
1652         }
1653 }
1654
1655 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1656 {
1657         float *end = dst + 4*size;
1658         __m128 v = _mm_loadu_ps(src);
1659         while (dst < end)
1660         {
1661                 _mm_store_ps(dst, v);
1662                 dst += 4;
1663         }
1664 }
1665 #endif
1666
// Transforms 'numitems' float[4] vectors from in4f to out4f as
//   out = x*M[0..3] + y*M[4..7] + z*M[8..11] + w*M[12..15]
// where M is inmatrix16f.  A matrix that is bitwise identical to the identity
// skips the math and only copies (nothing at all when out4f == in4f).
// out4f is written with aligned stores.  Without SSE_POSSIBLE the function
// body is empty.
static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
        static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
        __m128 mx, my, mz, mw;
        float *stop;
        if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
        {
                // fast case for identity matrix
                if (out4f != in4f)
                        memcpy(out4f, in4f, numitems * sizeof(float[4]));
                return;
        }
        stop = out4f + numitems*4;
        mx = _mm_loadu_ps(inmatrix16f);
        my = _mm_loadu_ps(inmatrix16f + 4);
        mz = _mm_loadu_ps(inmatrix16f + 8);
        mw = _mm_loadu_ps(inmatrix16f + 12);
        if ((((size_t)in4f)&(ALIGN_SIZE-1)) == 0)
        {
                // aligned input
                for (;out4f < stop;out4f += 4, in4f += 4)
                {
                        const __m128 v = _mm_load_ps(in4f);
                        const __m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
                        const __m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
                        const __m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
                        const __m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
                        // summed innermost-first: tx + (ty + (tz + tw))
                        _mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
                }
        }
        else
        {
                // unaligned input
                for (;out4f < stop;out4f += 4, in4f += 4)
                {
                        const __m128 v = _mm_loadu_ps(in4f);
                        const __m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
                        const __m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
                        const __m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
                        const __m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
                        _mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
                }
        }
#endif
}
1714
#if 0
// Compiled out: straight copy of 'numitems' float[4] elements.
static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
        const size_t bytes = numitems * sizeof(float[4]);
        memcpy(out4f, in4f, bytes);
}
#endif
1721
1722 #ifdef SSE_POSSIBLE
// Perspective-divides the vector 'in' = (x, y, z, w) and maps it through the
// viewport:
//   out = (c[1] + s[1]*x/w, c[2] + s[2]*y/w, c[3] + s[3]*z/w, c[0] + s[0]/w)
// with c = viewportcenter and s = viewportscale, i.e. element 0 of both
// viewport vectors belongs to the 1/w term and elements 1..3 to x, y, z.
// The macro body is a brace block that declares locals named p and w, so the
// arguments must not be expressions that refer to variables with those names.
1723 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1724 { \
1725         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1726         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1727         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1728         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1729 }
1730
// NOTE(review): the body is token-for-token the same as
// DPSOFTRAST_PROJECTVERTEX above; despite the name it projects all four
// lanes, not only y.
1731 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1732 { \
1733         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1734         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1735         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1736         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1737 }
1738
// Multiplies the vector 'in' = (x, y, z, w) by the matrix given as four
// registers: out = x*m0 + y*m1 + z*m2 + w*m3 (summed as
// x*m0 + (y*m1 + (z*m2 + w*m3))).  Declares a local named p, so 'in' and the
// matrix arguments must not refer to a variable called p.
1739 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1740 { \
1741         __m128 p = (in); \
1742         out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1743                                                   _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1744                                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1745                                                                                         _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
1746 }
1747
// Computes a screen-space row range for the axis-aligned box spanned by
// minposf/maxposf (both must be 16-byte aligned, they are read with
// _mm_load_ps) after transforming its eight corners by inmatrix16f.
//   *starty receives the truncated screen y derived from the largest
//           projected y/w, *endy the truncated screen y derived from the
//           smallest projected y/w plus one (presumably the viewport y scale
//           is negative so that starty <= endy - confirm against the code
//           that fills dpsoftrast.fb_viewportscale).
// Returns a bitmask with bit k set for every corner k whose z + w is
// negative; corner index bits 0, 1 and 2 select the maximum instead of the
// minimum for x, y and z respectively.
1748 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1749 {
1750         int clipmask = 0xFF;
1751         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1752         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1753         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1754         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
        // swap lanes 0 and 1 of every matrix row so that the transformed
        // corners come out as (y, x, z, w); the scalar (_ss) operations below
        // then work on y directly
1755         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1756         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1757         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1758         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
        // BBFRONT(k, pos): transform corner k, store z + w in clipdist[k];
        // when that is >= 0 clear bit k of clipmask and fold y/w of the
        // corner into minproj/maxproj
1759         #define BBFRONT(k, pos) \
1760         { \
1761                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1762                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1763                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1764                 { \
1765                         __m128 proj; \
1766                         clipmask &= ~(1<<k); \
1767                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1768                         minproj = _mm_min_ss(minproj, proj); \
1769                         maxproj = _mm_max_ss(maxproj, proj); \
1770                 } \
1771         }
        // the eight corners: x/y come from maxpos when bit 0/1 of k is set,
        // z and w come from maxpos when bit 2 of k is set
1772         BBFRONT(0, minpos); 
1773         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1774         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1775         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1776         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1777         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1778         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1779         BBFRONT(7, maxpos);
        // BBCLIP(k): for a corner that failed the z + w >= 0 test, visit the
        // three corners that differ from it in one axis (k^1, k^2, k^4); for
        // each of those that passed, interpolate along the edge to the point
        // where z + w reaches zero and fold its y/w into minproj/maxproj
1780         #define BBCLIP(k) \
1781         { \
1782                 if (clipmask&(1<<k)) \
1783                 { \
1784                         if (!(clipmask&(1<<(k^1)))) \
1785                         { \
1786                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1787                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1788                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1789                                 minproj = _mm_min_ss(minproj, proj); \
1790                                 maxproj = _mm_max_ss(maxproj, proj); \
1791                         } \
1792                         if (!(clipmask&(1<<(k^2)))) \
1793                         { \
1794                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1795                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1796                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1797                                 minproj = _mm_min_ss(minproj, proj); \
1798                                 maxproj = _mm_max_ss(maxproj, proj); \
1799                         } \
1800                         if (!(clipmask&(1<<(k^4)))) \
1801                         { \
1802                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1803                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1804                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1805                                 minproj = _mm_min_ss(minproj, proj); \
1806                                 maxproj = _mm_max_ss(maxproj, proj); \
1807                         } \
1808                 } \
1809         }
1810         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
        // move element 2 of the viewport vectors (the one
        // DPSOFTRAST_PROJECTVERTEX pairs with y) into lane 0
1811         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1812         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
        // clamp the projected range to [-2, 2] before converting to pixels
1813         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1814         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1815         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1816         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1817         *starty = _mm_cvttss_si32(maxproj);
1818         *endy = _mm_cvttss_si32(minproj)+1;
1819         return clipmask;
1820 }
1821         
1822 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1823 {
1824         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1825         float *end = out4f + numitems*4;
1826         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1827         __m128 minpos, maxpos;
1828         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1829         {
1830                 minpos = maxpos = _mm_loadu_ps(in4f);
1831                 while (out4f < end)
1832                 {
1833                         __m128 v = _mm_loadu_ps(in4f);
1834                         minpos = _mm_min_ps(minpos, v);
1835                         maxpos = _mm_max_ps(maxpos, v);
1836                         _mm_store_ps(out4f, v);
1837                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1838                         _mm_store_ps(screen4f, v);
1839                         in4f += 4;
1840                         out4f += 4;
1841                         screen4f += 4;
1842                 }
1843         }
1844         else
1845         {
1846                 minpos = maxpos = _mm_load_ps(in4f);
1847                 while (out4f < end)
1848                 {
1849                         __m128 v = _mm_load_ps(in4f);
1850                         minpos = _mm_min_ps(minpos, v);
1851                         maxpos = _mm_max_ps(maxpos, v);
1852                         _mm_store_ps(out4f, v);
1853                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1854                         _mm_store_ps(screen4f, v);
1855                         in4f += 4;
1856                         out4f += 4;
1857                         screen4f += 4;
1858                 }
1859         }
1860         if (starty && endy) 
1861         {
1862                 ALIGN(float minposf[4]);
1863                 ALIGN(float maxposf[4]);
1864                 _mm_store_ps(minposf, minpos);
1865                 _mm_store_ps(maxposf, maxpos);
1866                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1867         }
1868         return 0;
1869 }
1870
1871 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1872 {
1873         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1874         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1875         float *end;
1876         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1877                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1878         end = out4f + numitems*4;
1879         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1880         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1881         m0 = _mm_loadu_ps(inmatrix16f);
1882         m1 = _mm_loadu_ps(inmatrix16f + 4);
1883         m2 = _mm_loadu_ps(inmatrix16f + 8);
1884         m3 = _mm_loadu_ps(inmatrix16f + 12);
1885         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1886         {
1887                 minpos = maxpos = _mm_loadu_ps(in4f);
1888                 while (out4f < end)
1889                 {
1890                         __m128 v = _mm_loadu_ps(in4f);
1891                         minpos = _mm_min_ps(minpos, v);
1892                         maxpos = _mm_max_ps(maxpos, v);
1893                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1894                         _mm_store_ps(out4f, v);
1895                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1896                         _mm_store_ps(screen4f, v);
1897                         in4f += 4;
1898                         out4f += 4;
1899                         screen4f += 4;
1900                 }
1901         }
1902         else
1903         {
1904                 minpos = maxpos = _mm_load_ps(in4f);
1905                 while (out4f < end)
1906                 {
1907                         __m128 v = _mm_load_ps(in4f);
1908                         minpos = _mm_min_ps(minpos, v);
1909                         maxpos = _mm_max_ps(maxpos, v);
1910                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1911                         _mm_store_ps(out4f, v);
1912                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1913                         _mm_store_ps(screen4f, v);
1914                         in4f += 4;
1915                         out4f += 4;
1916                         screen4f += 4;
1917                 }
1918         }
1919         if (starty && endy) 
1920         {
1921                 ALIGN(float minposf[4]);
1922                 ALIGN(float maxposf[4]);
1923                 _mm_store_ps(minposf, minpos);
1924                 _mm_store_ps(maxposf, maxpos);
1925                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1926         }
1927         return 0;
1928 }
1929 #endif
1930
// Expands the vertex attribute 'inarray' for the current vertex range
// (dpsoftrast.firstvertex, dpsoftrast.numvertices) into float[4] elements in
// dpsoftrast.post_array4f[outarray] and returns that buffer.
//   position: float[3] source, w is filled with 1
//   color:    float[4] pointer if set, else 4-byte pointer if set, else the
//             constant dpsoftrast.color repeated
//   other:    treated as texcoord unit (inarray - DPSOFTRAST_ARRAY_TEXCOORD0)
//             with 2, 3 or 4 float components; the buffer is returned
//             unwritten when no pointer is set or the component count is
//             something else
// Returns NULL without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
        float *outf = dpsoftrast.post_array4f[outarray];
        const unsigned char *inb;
        const int firstvertex = dpsoftrast.firstvertex;
        const int numvertices = dpsoftrast.numvertices;
        int stride;
        if (inarray == DPSOFTRAST_ARRAY_POSITION)
        {
                stride = dpsoftrast.stride_vertex;
                inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
                DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
        }
        else if (inarray == DPSOFTRAST_ARRAY_COLOR)
        {
                stride = dpsoftrast.stride_color;
                if (dpsoftrast.pointer_color4f)
                {
                        inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
                        DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
                }
                else if (dpsoftrast.pointer_color4ub)
                {
                        inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
                        DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
                }
                else
                        DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
        }
        else
        {
                const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
                stride = dpsoftrast.stride_texcoord[unit];
                if (dpsoftrast.pointer_texcoordf[unit])
                {
                        const int components = dpsoftrast.components_texcoord[unit];
                        inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
                        if (components == 2)
                                DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
                        else if (components == 3)
                                DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
                        else if (components == 4)
                                DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
                }
        }
        return outf;
#else
        return NULL;
#endif
}
1989
1990 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1991 {
1992         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1993         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1994         return data;
1995 }
1996
#if 0
// Compiled out: projects an attribute array to screen space without a
// matrix, storing the clip mask and row range in the global draw state.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
        float *data;
        if (inarray >= 0)
                data = DPSOFTRAST_Array_Load(outarray, inarray);
        else
                data = dpsoftrast.post_array4f[outarray];
        dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
        return data;
#else
        return NULL;
#endif
}
#endif
2009
// Transforms an attribute array in place by inmatrix16f, writes the projected
// positions to dpsoftrast.screencoord4f and stores the returned clip mask
// and row range in dpsoftrast.drawclipped / drawstarty / drawendy.
// With inarray >= 0 the data is first loaded by DPSOFTRAST_Array_Load; a
// negative inarray reuses post_array4f[outarray].  Returns the array, or
// NULL without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
        float *data;
        if (inarray >= 0)
                data = DPSOFTRAST_Array_Load(outarray, inarray);
        else
                data = dpsoftrast.post_array4f[outarray];
        dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
        return data;
#else
        return NULL;
#endif
}
2020
2021 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2022 {
2023         int x;
2024         int startx = span->startx;
2025         int endx = span->endx;
2026         float wslope = triangle->w[0];
2027         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2028         float endz = 1.0f / (w + wslope * startx);
2029         if (triangle->w[0] == 0)
2030         {
2031                 // LordHavoc: fast flat polygons (HUD/menu)
2032                 for (x = startx;x < endx;x++)
2033                         zf[x] = endz;
2034                 return;
2035         }
2036         for (x = startx;x < endx;)
2037         {
2038                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2039                 float z = endz, dz;
2040                 if (nextsub >= endx) nextsub = endsub = endx-1;
2041                 endz = 1.0f / (w + wslope * nextsub);
2042                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2043                 for (; x <= endsub; x++, z += dz)
2044                         zf[x] = z;
2045         }
2046 }
2047
2048 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2049 {
2050 #ifdef SSE_POSSIBLE
2051         int x;
2052         int startx = span->startx;
2053         int endx = span->endx;
2054         int maskx;
2055         int subx;
2056         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2057         unsigned char * RESTRICT pixelmask = span->pixelmask;
2058         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2059         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2060         if (!pixel)
2061                 return;
2062         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2063         pixeli += span->y * dpsoftrast.fb_width + span->x;
2064         // handle alphatest now (this affects depth writes too)
2065         if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2066                 for (x = startx;x < endx;x++)
2067                         if (in4ub[x*4+3] < 128)
2068                                 pixelmask[x] = false;
2069         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2070         // helps sprites, text and hud artwork
2071         switch(thread->fb_blendmode)
2072         {
2073         case DPSOFTRAST_BLENDMODE_ALPHA:
2074         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2075         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2076                 maskx = startx;
2077                 for (x = startx;x < endx;x++)
2078                 {
2079                         if (in4ub[x*4+3] >= 1)
2080                         {
2081                                 startx = x;
2082                                 for (;;)
2083                                 {
2084                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2085                                         maskx = x;
2086                                         if (x >= endx) break;
2087                                         ++x;
2088                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2089                                         if (x >= endx) break;
2090                                 }
2091                                 break;
2092                         }
2093                 }
2094                 endx = maskx;
2095                 break;
2096         case DPSOFTRAST_BLENDMODE_OPAQUE:
2097         case DPSOFTRAST_BLENDMODE_ADD:
2098         case DPSOFTRAST_BLENDMODE_INVMOD:
2099         case DPSOFTRAST_BLENDMODE_MUL:
2100         case DPSOFTRAST_BLENDMODE_MUL2:
2101         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2102         case DPSOFTRAST_BLENDMODE_INVADD:
2103                 break;
2104         }
2105         // put some special values at the end of the mask to ensure the loops end
2106         pixelmask[endx] = 1;
2107         pixelmask[endx+1] = 0;
2108         // LordHavoc: use a double loop to identify subspans, this helps the
2109         // optimized copy/blend loops to perform at their best, most triangles
2110         // have only one run of pixels, and do the search using wide reads...
2111         x = startx;
2112         while (x < endx)
2113         {
2114                 // if this pixel is masked off, it's probably not alone...
2115                 if (!pixelmask[x])
2116                 {
2117                         x++;
2118 #if 1
2119                         if (x + 8 < endx)
2120                         {
2121                                 // the 4-item search must be aligned or else it stalls badly
2122                                 if ((x & 3) && !pixelmask[x]) 
2123                                 {
2124                                         if(pixelmask[x]) goto endmasked;
2125                                         x++;
2126                                         if (x & 3)
2127                                         {
2128                                                 if(pixelmask[x]) goto endmasked;
2129                                                 x++;
2130                                                 if (x & 3)
2131                                                 {
2132                                                         if(pixelmask[x]) goto endmasked;
2133                                                         x++;
2134                                                 }
2135                                         }
2136                                 }
2137                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2138                                         x += 4;
2139                         }
2140 #endif
2141                         for (;!pixelmask[x];x++)
2142                                 ;
2143                         // rather than continue the loop, just check the end variable
2144                         if (x >= endx)
2145                                 break;
2146                 }
2147         endmasked:
2148                 // find length of subspan
2149                 subx = x + 1;
2150 #if 1
2151                 if (subx + 8 < endx)
2152                 {
2153                         if (subx & 3)
2154                         {
2155                                 if(!pixelmask[subx]) goto endunmasked;
2156                                 subx++;
2157                                 if (subx & 3)
2158                                 {
2159                                         if(!pixelmask[subx]) goto endunmasked;
2160                                         subx++;
2161                                         if (subx & 3)
2162                                         {
2163                                                 if(!pixelmask[subx]) goto endunmasked;
2164                                                 subx++;
2165                                         }
2166                                 }
2167                         }
2168                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2169                                 subx += 4;
2170                 }
2171 #endif
2172                 for (;pixelmask[subx];subx++)
2173                         ;
2174                 // the checks can overshoot, so make sure to clip it...
2175                 if (subx > endx)
2176                         subx = endx;
2177         endunmasked:
2178                 // now that we know the subspan length...  process!
2179                 switch(thread->fb_blendmode)
2180                 {
2181                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2182 #if 0
2183                         if (subx - x >= 16)
2184                         {
2185                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2186                                 x = subx;
2187                         }
2188                         else
2189 #elif 1
2190                         while (x + 16 <= subx)
2191                         {
2192                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2193                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2194                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2195                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2196                                 x += 16;
2197                         }
2198 #endif
2199                         {
2200                                 while (x + 4 <= subx)
2201                                 {
2202                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2203                                         x += 4;
2204                                 }
2205                                 if (x + 2 <= subx)
2206                                 {
2207                                         pixeli[x] = ini[x];
2208                                         pixeli[x+1] = ini[x+1];
2209                                         x += 2;
2210                                 }
2211                                 if (x < subx)
2212                                 {
2213                                         pixeli[x] = ini[x];
2214                                         x++;
2215                                 }
2216                         }
2217                         break;
2218                 case DPSOFTRAST_BLENDMODE_ALPHA:
2219                 #define FINISHBLEND(blend2, blend1) \
2220                         for (;x + 1 < subx;x += 2) \
2221                         { \
2222                                 __m128i src, dst; \
2223                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2224                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2225                                 blend2; \
2226                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2227                         } \
2228                         if (x < subx) \
2229                         { \
2230                                 __m128i src, dst; \
2231                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2232                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2233                                 blend1; \
2234                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2235                                 x++; \
2236                         }
2237                         FINISHBLEND({
2238                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2239                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2240                         }, {
2241                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2242                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2243                         });
2244                         break;
2245                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2246                         FINISHBLEND({
2247                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2248                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2249                         }, {
2250                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2251                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2252                         });
2253                         break;
2254                 case DPSOFTRAST_BLENDMODE_ADD:
2255                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2256                         break;
2257                 case DPSOFTRAST_BLENDMODE_INVMOD:
2258                         FINISHBLEND({
2259                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2260                         }, {
2261                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2262                         });
2263                         break;
2264                 case DPSOFTRAST_BLENDMODE_MUL:
2265                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2266                         break;
2267                 case DPSOFTRAST_BLENDMODE_MUL2:
2268                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2269                         break;
2270                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2271                         FINISHBLEND({
2272                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2273                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2274                         }, {
2275                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2276                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2277                         });
2278                         break;
2279                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2280                         FINISHBLEND({
2281                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2282                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2283                         }, {
2284                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2285                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2286                         });
2287                         break;
2288                 case DPSOFTRAST_BLENDMODE_INVADD:
2289                         FINISHBLEND({
2290                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2291                         }, {
2292                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2293                         });
2294                         break;
2295                 }
2296         }
2297 #endif
2298 }
2299
2300 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2301         // warning: this is SLOW, only use if the optimized per-span functions won't do
2302 {
2303         const unsigned char * RESTRICT pixelbase;
2304         const unsigned char * RESTRICT pixel[4];
2305         int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2306         int wrapmask[2] = { width-1, height-1 };
2307         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*width;
2308         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2309         {
2310                 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2311                 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2312                 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2313                 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2314                 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2315                 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2316                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2317                 {
2318                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2319                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2320                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2321                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2322                 }
2323                 else
2324                 {
2325                         tci[0] &= wrapmask[0];
2326                         tci[1] &= wrapmask[1];
2327                         tci1[0] &= wrapmask[0];
2328                         tci1[1] &= wrapmask[1];
2329                 }
2330                 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2331                 pixel[1] = pixelbase + 4 * (tci[0] - tci[1]*width);
2332                 pixel[2] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2333                 pixel[3] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2334                 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2335                 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2336                 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2337                 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2338         }
2339         else
2340         {
2341                 int tci[2] = { x * width, y * height };
2342                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2343                 {
2344                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2345                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2346                 }
2347                 else
2348                 {
2349                         tci[0] &= wrapmask[0];
2350                         tci[1] &= wrapmask[1];
2351                 }
2352                 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2353                 c[0] = pixel[0][0];
2354                 c[1] = pixel[0][1];
2355                 c[2] = pixel[0][2];
2356                 c[3] = pixel[0][3];
2357         }
2358 }
2359
2360 #if 0
// Float-output per-span 2D texture fetch (currently compiled out by the
// enclosing #if 0).
// For every pixel x in [span->startx, span->endx) writes four floats to
// out4f[x*4 .. x*4+3], sampled from thread->texbound[texunitindex]:
//  - no texture bound: all four components are 1.0
//  - mip level of a single texel (mipmap[mip][1] == 4): that one color
//  - otherwise the texture coordinates are evaluated at subspan boundaries
//    (every DPSOFTRAST_DRAW_MAXSUBSPAN pixels and at the last pixel) from
//    data/slope and zf[], and stepped linearly in between using fixed point
//    with 12 fractional bits
// Output components are scaled to 0..1 and reordered relative to the texture
// bytes: out[0] = byte 2, out[1] = byte 1, out[2] = byte 0, out[3] = byte 3.
// data/slope are filled in by DPSOFTRAST_CALCATTRIB4F (presumably the
// attribute's value and per-pixel slope for arrayindex - defined elsewhere).
2361 static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2362 {
2363         int x;
2364         int startx = span->startx;
2365         int endx = span->endx;
2366         int flags;
2367         float c[4];
2368         float data[4];
2369         float slope[4];
2370         float tc[2], endtc[2];
2371         float tcscale[2];
2372         unsigned int tci[2];
2373         unsigned int tci1[2];
2374         unsigned int tcimin[2];
2375         unsigned int tcimax[2];
2376         int tciwrapmask[2];
2377         int tciwidth;
2378         int filter;
2379         int mip;
2380         const unsigned char * RESTRICT pixelbase;
2381         const unsigned char * RESTRICT pixel[4];
2382         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2383         // if no texture is bound, just fill it with white
2384         if (!texture)
2385         {
2386                 for (x = startx;x < endx;x++)
2387                 {
2388                         out4f[x*4+0] = 1.0f;
2389                         out4f[x*4+1] = 1.0f;
2390                         out4f[x*4+2] = 1.0f;
2391                         out4f[x*4+3] = 1.0f;
2392                 }
2393                 return;
2394         }
2395         mip = triangle->mip[texunitindex];
             // pixelbase is one row (4*width bytes) before mipmap[mip][0] + mipmap[mip][1]
2396         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2397         // if this mipmap of the texture is 1 pixel, just fill it with that color
2398         if (texture->mipmap[mip][1] == 4)
2399         {
2400                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2401                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2402                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2403                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2404                 for (x = startx;x < endx;x++)
2405                 {
2406                         out4f[x*4+0] = c[0];
2407                         out4f[x*4+1] = c[1];
2408                         out4f[x*4+2] = c[2];
2409                         out4f[x*4+3] = c[3];
2410                 }
2411                 return;
2412         }
2413         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2414         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2415         flags = texture->flags;
2416         tcscale[0] = texture->mipmap[mip][2];
2417         tcscale[1] = texture->mipmap[mip][3];
             // negative row stride in texels: row ty lies ty rows before pixelbase
2418         tciwidth = -texture->mipmap[mip][2];
2419         tcimin[0] = 0;
2420         tcimin[1] = 0;
2421         tcimax[0] = texture->mipmap[mip][2]-1;
2422         tcimax[1] = texture->mipmap[mip][3]-1;
2423         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2424         tciwrapmask[1] = texture->mipmap[mip][3]-1;
             // texel-space coordinate at the first pixel of the span
2425         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2426         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2427         if (filter)
2428         {
                     // linear filtering samples around the texel centre: shift by half a texel
2429                 endtc[0] -= 0.5f;
2430                 endtc[1] -= 0.5f;
2431         }
2432         for (x = startx;x < endx;)
2433         {
2434                 unsigned int subtc[2];
2435                 unsigned int substep[2];
                     // converts a coordinate delta over one subspan into a per-pixel step with 12 fractional bits
2436                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2437                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2438                 if (nextsub >= endx)
2439                 {
                             // last subspan: interpolate up to and including the final pixel
2440                         nextsub = endsub = endx-1;      
2441                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2442                 }
2443                 tc[0] = endtc[0];
2444                 tc[1] = endtc[1];
2445                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2446                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2447                 if (filter)
2448                 {
2449                         endtc[0] -= 0.5f;
2450                         endtc[1] -= 0.5f;
2451                 }
2452                 substep[0] = (endtc[0] - tc[0]) * subscale;
2453                 substep[1] = (endtc[1] - tc[1]) * subscale;
2454                 subtc[0] = tc[0] * (1<<12);
2455                 subtc[1] = tc[1] * (1<<12);
2456                 if (filter)
2457                 {
2458                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2459                         {
                                     // bilinear, texel indices clamped to [tcimin, tcimax]
2460                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2461                                 {
2462                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2463                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
                                             // weights for (x0,y0) (x1,y0) (x0,y1) (x1,y1); they sum to 1<<24
2464                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2465                                         tci[0] = subtc[0]>>12;
2466                                         tci[1] = subtc[1]>>12;
2467                                         tci1[0] = tci[0] + 1;
2468                                         tci1[1] = tci[1] + 1;
2469                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2470                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2471                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2472                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2473                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2474                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2475                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2476                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
                                             // 0xFF000000 = 255 * (1<<24): undoes both the weight scale and the byte range
2477                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2478                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2479                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2480                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2481                                         out4f[x*4+0] = c[0];
2482                                         out4f[x*4+1] = c[1];
2483                                         out4f[x*4+2] = c[2];
2484                                         out4f[x*4+3] = c[3];
2485                                 }
2486                         }
2487                         else
2488                         {
                                     // bilinear, texel indices ANDed with size-1
2489                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2490                                 {
2491                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2492                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2493                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2494                                         tci[0] = subtc[0]>>12;
2495                                         tci[1] = subtc[1]>>12;
2496                                         tci1[0] = tci[0] + 1;
2497                                         tci1[1] = tci[1] + 1;
2498                                         tci[0] &= tciwrapmask[0];
2499                                         tci[1] &= tciwrapmask[1];
2500                                         tci1[0] &= tciwrapmask[0];
2501                                         tci1[1] &= tciwrapmask[1];
2502                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2503                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2504                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2505                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2506                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2507                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2508                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2509                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2510                                         out4f[x*4+0] = c[0];
2511                                         out4f[x*4+1] = c[1];
2512                                         out4f[x*4+2] = c[2];
2513                                         out4f[x*4+3] = c[3];
2514                                 }
2515                         }
2516                 }
2517                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2518                 {
                             // single texel per pixel, indices clamped to [tcimin, tcimax]
2519                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2520                         {
2521                                 tci[0] = subtc[0]>>12;
2522                                 tci[1] = subtc[1]>>12;
2523                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2524                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2525                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2526                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2527                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2528                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2529                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2530                                 out4f[x*4+0] = c[0];
2531                                 out4f[x*4+1] = c[1];
2532                                 out4f[x*4+2] = c[2];
2533                                 out4f[x*4+3] = c[3];
2534                         }
2535                 }
2536                 else
2537                 {
                             // single texel per pixel, indices ANDed with size-1
2538                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2539                         {
2540                                 tci[0] = subtc[0]>>12;
2541                                 tci[1] = subtc[1]>>12;
2542                                 tci[0] &= tciwrapmask[0];
2543                                 tci[1] &= tciwrapmask[1];
2544                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2545                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2546                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2547                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2548                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2549                                 out4f[x*4+0] = c[0];
2550                                 out4f[x*4+1] = c[1];
2551                                 out4f[x*4+2] = c[2];
2552                                 out4f[x*4+3] = c[3];
2553                         }
2554                 }
2555         }
2556 }
2557 #endif
2558
2559 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2560 {
2561 #ifdef SSE_POSSIBLE
2562         int x;
2563         int startx = span->startx;
2564         int endx = span->endx;
2565         int flags;
2566         __m128 data, slope, tcscale;
2567         __m128i tcsize, tcmask, tcoffset, tcmax;
2568         __m128 tc, endtc;
2569         __m128i subtc, substep, endsubtc;
2570         int filter;
2571         int mip;
2572         int affine; // LordHavoc: optimized affine texturing case
2573         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2574         const unsigned char * RESTRICT pixelbase;
2575         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2576         // if no texture is bound, just fill it with white
2577         if (!texture)
2578         {
2579                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2580                 return;
2581         }
2582         mip = triangle->mip[texunitindex];
2583         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2584         // if this mipmap of the texture is 1 pixel, just fill it with that color
2585         if (texture->mipmap[mip][1] == 4)
2586         {
2587                 unsigned int k = *((const unsigned int *)pixelbase);
2588                 for (x = startx;x < endx;x++)
2589                         outi[x] = k;
2590                 return;
2591         }
2592         affine = zf[startx] == zf[endx-1];
2593         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2594         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2595         flags = texture->flags;
2596         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2597         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2598         tcscale = _mm_cvtepi32_ps(tcsize);
2599         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2600         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2601         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2602         if (filter)
2603                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2604         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2605         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_sub_epi32(_mm_setzero_si128(), _mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0))), 18), _mm_set1_epi32(4));
2606         tcmax = _mm_packs_epi32(tcmask, tcmask);
2607         for (x = startx;x < endx;)
2608         {
2609                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2610                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2611                 if (nextsub >= endx || affine)
2612                 {
2613                         nextsub = endsub = endx-1;
2614                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2615                 }       
2616                 tc = endtc;
2617                 subtc = endsubtc;
2618                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2619                 if (filter)
2620                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2621                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2622                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2623                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2624                 substep = _mm_slli_epi32(substep, 1);
2625                 if (filter)
2626                 {
2627                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2628                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2629                         {
2630                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2631                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2632                                 {
2633                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2634                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2635                                         tci = _mm_madd_epi16(tci, tcoffset);
2636                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2637                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2638                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2639                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2640                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2641                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2642                                         fracm = _mm_srli_epi16(subtc, 1);
2643                                         pix1 = _mm_add_epi16(pix1,
2644                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2645                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2646                                         pix3 = _mm_add_epi16(pix3,
2647                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2648                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2649                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2650                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2651                                         pix2 = _mm_add_epi16(pix2,
2652                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2653                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2654                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2655                                 }
2656                                 if (x <= endsub)
2657                                 {
2658                                         const unsigned char * RESTRICT ptr1;
2659                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2660                                         tci = _mm_madd_epi16(tci, tcoffset);
2661                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2662                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2663                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2664                                         fracm = _mm_srli_epi16(subtc, 1);
2665                                         pix1 = _mm_add_epi16(pix1,
2666                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2667                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2668                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2669                                         pix1 = _mm_add_epi16(pix1,
2670                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2671                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2672                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2673                                         x++;
2674                                 }
2675                         }
2676                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2677                         {
2678                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2679                                 {
2680                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2681                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2682                                         tci = _mm_madd_epi16(tci, tcoffset);
2683                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2684                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2685                                                                                         _mm_setzero_si128());
2686                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2687                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2688                                                                                         _mm_setzero_si128());
2689                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2690                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2691                                         tci = _mm_madd_epi16(tci, tcoffset);
2692                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2693                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2694                                                                                         _mm_setzero_si128());
2695                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2696                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2697                                                                                         _mm_setzero_si128());
2698                                         fracm = _mm_srli_epi16(subtc, 1);
2699                                         pix1 = _mm_add_epi16(pix1,
2700                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2701                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2702                                         pix3 = _mm_add_epi16(pix3,
2703                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2704                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2705                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2706                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2707                                         pix2 = _mm_add_epi16(pix2,
2708                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2709                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2710                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2711                                 }
2712                                 if (x <= endsub)
2713                                 {
2714                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2715                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2716                                         tci = _mm_madd_epi16(tci, tcoffset);
2717                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2718                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2719                                                                                         _mm_setzero_si128());
2720                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2721                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2722                                                                                         _mm_setzero_si128());
2723                                         fracm = _mm_srli_epi16(subtc, 1);
2724                                         pix1 = _mm_add_epi16(pix1,
2725                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2726                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2727                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2728                                         pix1 = _mm_add_epi16(pix1,
2729                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2730                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2731                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2732                                         x++;
2733                                 }
2734                         }
2735                         else
2736                         {
2737                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2738                                 {
2739                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2740                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2741                                         tci = _mm_madd_epi16(tci, tcoffset);
2742                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2743                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2744                                                                                         _mm_setzero_si128());
2745                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2746                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2747                                                                                         _mm_setzero_si128());
2748                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2749                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2750                                         tci = _mm_madd_epi16(tci, tcoffset);
2751                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2752                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2753                                                                                         _mm_setzero_si128());
2754                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2755                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2756                                                                                         _mm_setzero_si128());
2757                                         fracm = _mm_srli_epi16(subtc, 1);
2758                                         pix1 = _mm_add_epi16(pix1,
2759                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2760                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2761                                         pix3 = _mm_add_epi16(pix3,
2762                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2763                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2764                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2765                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2766                                         pix2 = _mm_add_epi16(pix2,
2767                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2768                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2769                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2770                                 }
2771                                 if (x <= endsub)
2772                                 {
2773                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2774                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2775                                         tci = _mm_madd_epi16(tci, tcoffset);
2776                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2777                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2778                                                                                         _mm_setzero_si128());
2779                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2780                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2781                                                                                         _mm_setzero_si128());
2782                                         fracm = _mm_srli_epi16(subtc, 1);
2783                                         pix1 = _mm_add_epi16(pix1,
2784                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2785                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2786                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2787                                         pix1 = _mm_add_epi16(pix1,
2788                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2789                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2790                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2791                                         x++;
2792                                 }
2793                         }
2794                 }
2795                 else
2796                 {
2797                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2798                         {
2799                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2800                                 {
2801                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2802                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2803                                         tci = _mm_madd_epi16(tci, tcoffset);
2804                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2805                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2806                                 }
2807                                 if (x <= endsub)
2808                                 {
2809                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2810                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2811                                         tci = _mm_madd_epi16(tci, tcoffset);
2812                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2813                                         x++;
2814                                 }
2815                         }
2816                         else
2817                         {
2818                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2819                                 {
2820                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2821                                         tci = _mm_and_si128(tci, tcmax); 
2822                                         tci = _mm_madd_epi16(tci, tcoffset);
2823                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2824                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2825                                 }
2826                                 if (x <= endsub)
2827                                 {
2828                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2829                                         tci = _mm_and_si128(tci, tcmax); 
2830                                         tci = _mm_madd_epi16(tci, tcoffset);
2831                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2832                                         x++;
2833                                 }
2834                         }
2835                 }
2836         }
2837 #endif
2838 }
2839
2840 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2841 {
2842         // TODO: IMPLEMENT
2843         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2844 }
2845
// Shadowmap lookup stub: the input vector is ignored and 1.0f is returned
// for every query.
static float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float result = 1.0f;
        (void)vector;
        return result;
}
2851
2852 #if 0
2853 static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2854 {
2855         int x;
2856         int startx = span->startx;
2857         int endx = span->endx;
2858         float c[4];
2859         float data[4];
2860         float slope[4];
2861         float z;
2862         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2863         for (x = startx;x < endx;x++)
2864         {
2865                 z = zf[x];
2866                 c[0] = (data[0] + slope[0]*x) * z;
2867                 c[1] = (data[1] + slope[1]*x) * z;
2868                 c[2] = (data[2] + slope[2]*x) * z;
2869                 c[3] = (data[3] + slope[3]*x) * z;
2870                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2871                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2872                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2873                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2874         }
2875 }
2876 #endif
2877
2878 #if 0
2879 static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2880 {
2881         int x;
2882         int startx = span->startx;
2883         int endx = span->endx;
2884         float c[4];
2885         float data[4];
2886         float slope[4];
2887         float z;
2888         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2889         for (x = startx;x < endx;x++)
2890         {
2891                 z = zf[x];
2892                 c[0] = (data[0] + slope[0]*x) * z;
2893                 c[1] = (data[1] + slope[1]*x) * z;
2894                 c[2] = (data[2] + slope[2]*x) * z;
2895                 c[3] = (data[3] + slope[3]*x) * z;
2896                 out4f[x*4+0] = c[0];
2897                 out4f[x*4+1] = c[1];
2898                 out4f[x*4+2] = c[2];
2899                 out4f[x*4+3] = c[3];
2900         }
2901 }
2902 #endif
2903
2904 #if 0
2905 static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2906 {
2907         int x, startx = span->startx, endx = span->endx;
2908         float c[4], localcolor[4];
2909         localcolor[0] = subcolor[0];
2910         localcolor[1] = subcolor[1];
2911         localcolor[2] = subcolor[2];
2912         localcolor[3] = subcolor[3];
2913         for (x = startx;x < endx;x++)
2914         {
2915                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2916                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2917                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2918                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2919                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2920                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2921                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2922                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2923         }
2924 }
2925 #endif
2926
2927 #if 0
2928 static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2929 {
2930         int x, startx = span->startx, endx = span->endx;
2931         for (x = startx;x < endx;x++)
2932         {
2933                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2934                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2935                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2936                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2937         }
2938 }
2939 #endif
2940
2941 #if 0
2942 static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2943 {
2944         int x, startx = span->startx, endx = span->endx;
2945         for (x = startx;x < endx;x++)
2946         {
2947                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2948                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2949                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2950                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2951         }
2952 }
2953 #endif
2954
2955 #if 0
2956 static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2957 {
2958         int x, startx = span->startx, endx = span->endx;
2959         float a, b;
2960         for (x = startx;x < endx;x++)
2961         {
2962                 a = 1.0f - inb4f[x*4+3];
2963                 b = inb4f[x*4+3];
2964                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2965                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2966                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2967                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2968         }
2969 }
2970 #endif
2971
2972 #if 0
2973 static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2974 {
2975         int x, startx = span->startx, endx = span->endx;
2976         float localcolor[4], ilerp, lerp;
2977         localcolor[0] = color[0];
2978         localcolor[1] = color[1];
2979         localcolor[2] = color[2];
2980         localcolor[3] = color[3];
2981         ilerp = 1.0f - localcolor[3];
2982         lerp = localcolor[3];
2983         for (x = startx;x < endx;x++)
2984         {
2985                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2986                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2987                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2988                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2989         }
2990 }
2991 #endif
2992
2993
2994
2995 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2996 {
2997 #ifdef SSE_POSSIBLE
2998         int x;
2999         int startx = span->startx;
3000         int endx = span->endx;
3001         __m128 data, slope;
3002         __m128 mod, endmod;
3003         __m128i submod, substep, endsubmod;
3004         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3005         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3006         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3007         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3008         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3009         for (x = startx; x < endx;)
3010         {
3011                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3012                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3013                 if (nextsub >= endx)
3014                 {
3015                         nextsub = endsub = endx-1;
3016                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3017                 }
3018                 mod = endmod;
3019                 submod = endsubmod;
3020                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3021                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3022                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3023                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3024                 substep = _mm_packs_epi32(substep, substep);
3025                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3026                 {
3027                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3028                         pix = _mm_mulhi_epu16(pix, submod);
3029                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3030                 }
3031                 if (x <= endsub)
3032                 {
3033                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3034                         pix = _mm_mulhi_epu16(pix, submod);
3035                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3036                         x++;
3037                 }
3038         }
3039 #endif
3040 }
3041
3042 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3043 {
3044 #ifdef SSE_POSSIBLE
3045         int x;
3046         int startx = span->startx;
3047         int endx = span->endx;
3048         __m128 data, slope;
3049         __m128 mod, endmod;
3050         __m128i submod, substep, endsubmod;
3051         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3052         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3053         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3054         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3055         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3056         for (x = startx; x < endx;)
3057         {
3058                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3059                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3060                 if (nextsub >= endx)
3061                 {
3062                         nextsub = endsub = endx-1;
3063                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3064                 }
3065                 mod = endmod;
3066                 submod = endsubmod;
3067                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3068                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3069                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3070                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3071                 substep = _mm_packs_epi32(substep, substep);
3072                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3073                 {
3074                         __m128i pix = _mm_srai_epi16(submod, 4);
3075                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3076                 }
3077                 if (x <= endsub)
3078                 {
3079                         __m128i pix = _mm_srai_epi16(submod, 4);
3080                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3081                         x++;
3082                 }
3083         }
3084 #endif
3085 }
3086
3087 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3088 {
3089 #ifdef SSE_POSSIBLE
3090         int x, startx = span->startx, endx = span->endx;
3091         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3092         localcolor = _mm_packs_epi32(localcolor, localcolor);
3093         for (x = startx;x+2 <= endx;x+=2)
3094         {
3095                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3096                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3097                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3098                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3099         }
3100         if (x < endx)
3101         {
3102                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3103                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3104                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3105                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3106         }
3107 #endif
3108 }
3109
3110 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3111 {
3112 #ifdef SSE_POSSIBLE
3113         int x, startx = span->startx, endx = span->endx;
3114         for (x = startx;x+2 <= endx;x+=2)
3115         {
3116                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3117                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3118                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3119                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3120         }
3121         if (x < endx)
3122         {
3123                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3124                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3125                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3126                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3127         }
3128 #endif
3129 }
3130
3131 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3132 {
3133 #ifdef SSE_POSSIBLE
3134         int x, startx = span->startx, endx = span->endx;
3135         for (x = startx;x+2 <= endx;x+=2)
3136         {
3137                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3138                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3139                 pix1 = _mm_add_epi16(pix1, pix2);
3140                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3141         }
3142         if (x < endx)
3143         {
3144                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3145                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3146                 pix1 = _mm_add_epi16(pix1, pix2);
3147                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3148         }
3149 #endif
3150 }
3151
3152 #if 0
3153 static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3154 {
3155 #ifdef SSE_POSSIBLE
3156         int x, startx = span->startx, endx = span->endx;
3157         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3158         tint = _mm_packs_epi32(tint, tint);
3159         for (x = startx;x+2 <= endx;x+=2)
3160         {
3161                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3162                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3163                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3164                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3165         }
3166         if (x < endx)
3167         {
3168                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3169                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3170                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3171                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3172         }
3173 #endif
3174 }
3175 #endif
3176
3177 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3178 {
3179 #ifdef SSE_POSSIBLE
3180         int x, startx = span->startx, endx = span->endx;
3181         for (x = startx;x+2 <= endx;x+=2)
3182         {
3183                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3184                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3185                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3186                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3187                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3188         }
3189         if (x < endx)
3190         {
3191                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3192                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3193                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3194                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3195                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3196         }
3197 #endif
3198 }
3199
3200 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3201 {
3202 #ifdef SSE_POSSIBLE
3203         int x, startx = span->startx, endx = span->endx;
3204         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3205         localcolor = _mm_packs_epi32(localcolor, localcolor);
3206         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3207         for (x = startx;x+2 <= endx;x+=2)
3208         {
3209                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3210                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3211                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3212         }
3213         if (x < endx)
3214         {
3215                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3216                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3217                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3218         }
3219 #endif
3220 }
3221
3222
3223
// Vertex stage of the generic shader mode: transforms and projects the
// position array with the ModelViewProjection matrix uniform, then loads the
// color and texcoord0 arrays via DPSOFTRAST_Array_Load.
// NOTE(review): every Array_Load call here passes the same array id twice, so
// the argument order (destination/source) cannot be told from this function.
static void DPSOFTRAST_VertexShader_Generic(void)
{
        DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
        DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
        DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
        // texcoord1 is only loaded for the SPECULAR permutation, which is the one in
        // which DPSOFTRAST_PixelShader_Generic samples a second texture
        if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
                DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
}
3232
3233 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3234 {
3235         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3236         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3237         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3238         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3239         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3240         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3241         {
3242                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3243                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3244                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3245                 {
3246                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3247                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3248                         {
3249                                 // multiply
3250                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3251                         }
3252                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3253                         {
3254                                 // add
3255                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3256                         }
3257                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3258                         {
3259                                 // alphablend
3260                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3261                         }
3262                 }
3263         }
3264         else
3265                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3266         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3267 }
3268
3269
3270
// Vertex stage of the post-process shader mode: transforms and projects the
// position array with the ModelViewProjection matrix uniform and loads the two
// texcoord arrays used by DPSOFTRAST_PixelShader_PostProcess.
static void DPSOFTRAST_VertexShader_PostProcess(void)
{
        DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
        DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
        // NOTE(review): presumably (destination, source), i.e. the TEXCOORD4 input
        // is exposed to the pixel stage as TEXCOORD1 - confirm against
        // DPSOFTRAST_Array_Load
        DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
}
3277
// Pixel stage of the post-process shader mode:
//   1. FragColor = texture(GL20TU_FIRST)
//   2. BLOOM permutation: add texture(GL20TU_SECOND) after subtracting the
//      BloomColorSubtract uniform from it
//   3. blend the ViewTintColor uniform over the result, weighted by that
//      color's own alpha
//   4. hand the span to DPSOFTRAST_Draw_Span_FinishBGRA8
// The SATURATION and GAMMARAMPS permutations are recognized but currently do
// nothing.  The numeric attribute indices (2 and 3) are presumably
// DPSOFTRAST_ARRAY_TEXCOORD0 / TEXCOORD1 - confirm against the
// DPSOFTRAST_ARRAY_* enum.
static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
{
        // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
        float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
        unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
        DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
        if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
        {
                DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
                // FragColor is both an input and the output of the bloom add
                DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
        }
        // the view tint is applied unconditionally, in place
        DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
        if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
        {
                // TODO: implement saturation
        }
        if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
        {
                // TODO: implement gammaramps
        }
        DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
}
3302
3303
3304
// Vertex stage of the depth/shadow shader mode: only the position array is
// processed (transformed and projected with the ModelViewProjection matrix
// uniform); no other vertex arrays are loaded.
static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
{
        DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
}
3309
3310 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3311 {
3312         // this is never called (because colormask is off when this shader is used)
3313         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3314         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3315         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3316         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3317         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3318 }
3319
3320
3321
// Vertex stage of the flat-color shader mode: transforms and projects the
// position array with the ModelViewProjection matrix uniform and transforms
// the texcoord0 array with the TexMatrix uniform.
static void DPSOFTRAST_VertexShader_FlatColor(void)
{
        DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
        DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
}
3327
// Pixel stage of the flat-color shader mode:
// FragColor = texture(GL20TU_COLOR) * Color_Ambient uniform, where the
// fourth (alpha) factor comes from the Alpha uniform instead of
// Color_Ambient's fourth component.
//
// For opaque draws without ALPHAKILL the result is written straight into
// dpsoftrast.fb_colorpixels[0]; otherwise it goes to a temporary span buffer
// that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.  Pixels whose
// span->pixelmask byte is zero are skipped.
// The body is empty when SSE_POSSIBLE is not defined.
static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
{
#ifdef SSE_POSSIBLE
        unsigned char * RESTRICT pixelmask = span->pixelmask;
        // default target: byte offset (span->y * fb_width + span->x) * 4 into the
        // first color buffer, so pixel[x*4] is the span's pixel x
        unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
        int x, startx = span->startx, endx = span->endx;
        __m128i Color_Ambientm;
        float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
        unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
        DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
        // alphakill or any non-opaque blend mode: render to the temporary buffer
        // and let FinishBGRA8 deal with it at the end
        if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
                pixel = buffer_FragColorbgra8;
        // ambient color scaled by 256, lanes 0 and 2 swapped (presumably RGBA -> BGRA)
        Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
        // clear lane 3 and replace it with the Alpha uniform scaled by 255
        Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
        Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
        // narrow to 16-bit words, replicated for two pixels
        Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
        for (x = startx;x < endx;x++)
        {
                __m128i color, pix;
                // fast path: four pixels at once when all four mask bytes are 1
                if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
                {
                        __m128i pix2;
                        color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                        // texel bytes go into the high half of each word (value << 8), so
                        // the high word of the product is (scale * texel) >> 8
                        pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
                        pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
                        _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                        // together with the loop's x++ this advances by four pixels
                        x += 3;
                        continue;
                }
                // slow path: one pixel, skipped when masked out
                if (!pixelmask[x])
                        continue;
                color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
                pix = _mm_mulhi_epu16(Color_Ambientm, color);
                *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
        }
        // only needed when the span was rendered to the temporary buffer
        if (pixel == buffer_FragColorbgra8)
                DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
#endif
}
3369
3370
3371
// Vertex stage of the vertex-color shader mode: transforms and projects the
// position array with the ModelViewProjection matrix uniform, loads the color
// array, and transforms the texcoord0 array with the TexMatrix uniform.
static void DPSOFTRAST_VertexShader_VertexColor(void)
{
        DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
        DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
        DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
}
3378
// Pixel stage of the vertex-color shader mode:
// FragColor = texture(GL20TU_COLOR) * (Color_Ambient + Color_Diffuse * vertexcolor)
// where vertexcolor is the DPSOFTRAST_ARRAY_COLOR attribute interpolated
// along the span and multiplied by buffer_z[x].  The alpha factor is the
// Alpha uniform alone: the diffuse term's fourth lane is masked to zero.
//
// For opaque draws without ALPHAKILL the result is written straight into
// dpsoftrast.fb_colorpixels[0]; otherwise it goes to a temporary span buffer
// that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.  Pixels whose
// span->pixelmask byte is zero are skipped.
// The body is empty when SSE_POSSIBLE is not defined.
static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
{
#ifdef SSE_POSSIBLE
        unsigned char * RESTRICT pixelmask = span->pixelmask;
        // default target: byte offset (span->y * fb_width + span->x) * 4 into the
        // first color buffer, so pixel[x*4] is the span's pixel x
        unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
        int x, startx = span->startx, endx = span->endx;
        __m128i Color_Ambientm, Color_Diffusem;
        __m128 data, slope;
        float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
        unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
        int arrayindex = DPSOFTRAST_ARRAY_COLOR;
        DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
        DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
        // alphakill or any non-opaque blend mode: render to the temporary buffer
        // and let FinishBGRA8 deal with it at the end
        if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
                pixel = buffer_FragColorbgra8;
        // ambient: scaled by 256, lanes 0 and 2 swapped (presumably RGBA -> BGRA),
        // lane 3 replaced by the Alpha uniform scaled by 255, narrowed to words
        Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
        Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
        Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
        Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
        // diffuse: scaled by 4096, same lane swap, lane 3 cleared, narrowed to words
        Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
        Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
        Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
        DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
        data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
        slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
        // move the interpolator to startx and pre-scale value and step by 4096
        data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
        data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
        slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
        // data is stepped once per pixel by the loop header; this also happens on
        // every 'continue', so skipped pixels keep the interpolator in sync
        for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
        {
                __m128i color, mod, pix;
                // fast path: four pixels at once when all four mask bytes are 1
                if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
                {
                        __m128i pix2, mod2;
                        __m128 z = _mm_loadu_ps(&buffer_z[x]);
                        color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                        // mod holds the modulators of pixels x and x+1, mod2 those of x+2 and
                        // x+3; data is stepped three times here, the fourth step comes from
                        // the loop header
                        mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
                        data = _mm_add_ps(data, slope);
                        mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
                        data = _mm_add_ps(data, slope);
                        mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
                        data = _mm_add_ps(data, slope);
                        mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
                        // (diffuse*4096 * mod*4096) >> 16 is diffuse*mod scaled by 256, the
                        // same scale as the ambient term it is added to
                        pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
                                                                  _mm_unpacklo_epi8(_mm_setzero_si128(), color));
                        pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
                                                                   _mm_unpackhi_epi8(_mm_setzero_si128(), color));
                        _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                        // together with the loop's x++ this advances by four pixels
                        x += 3;
                        continue;
                }
                // slow path: one pixel, skipped when masked out
                if (!pixelmask[x])
                        continue;
                color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
                mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x])));
                mod = _mm_packs_epi32(mod, mod);
                pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
                *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
        }
        // only needed when the span was rendered to the temporary buffer
        if (pixel == buffer_FragColorbgra8)
                DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
#endif
}
3443
3444
3445
// Vertex stage of the lightmap shader mode: transforms and projects the
// position array with the ModelViewProjection matrix uniform, transforms the
// texcoord0 array with the TexMatrix uniform, and loads the texcoord4 array
// (the one DPSOFTRAST_PixelShader_Lightmap uses for GL20TU_LIGHTMAP).
static void DPSOFTRAST_VertexShader_Lightmap(void)
{
        DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
        DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
        DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
}
3452
// Pixel stage of the lightmap shader, run once per span.
// For every pixel whose pixelmask byte is non-zero it computes
//   color * (lightmap * Color_Diffuse + Color_Ambient)
// and, when the GLOW permutation is enabled, adds glow * Color_Glow.
// All math is 16-bit fixed point done with SSE2 integer intrinsics; the whole
// body is compiled only when SSE_POSSIBLE is defined (otherwise the function
// does nothing).
// thread   - per-thread state; supplies the uniforms, shader permutation and blend mode
// triangle - triangle being drawn, forwarded to the span helpers
// span     - span to shade; supplies x/y, startx/endx and the pixelmask
3453 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3454 {
3455 #ifdef SSE_POSSIBLE
3456         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // by default results are written straight into color buffer 0 at the span's position (4 bytes per pixel)
3457         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3458         int x, startx = span->startx, endx = span->endx;
3459         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3460         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3461         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3462         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3463         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3464         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3465         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // color texture is sampled with TEXCOORD0, lightmap with TEXCOORD4
3466         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3467         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
             // with alpha kill or any non-opaque blend mode, shade into a temporary buffer that is handed to
             // DPSOFTRAST_Draw_Span_FinishBGRA8 at the end instead of writing the framebuffer directly
3468         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3469                 pixel = buffer_FragColorbgra8;
             // Color_Ambient uniform scaled by 256 and converted to integers; the shuffle swaps lanes 0 and 2
             // (presumably RGB -> BGR to match the bgra8 buffers), lane 3 is cleared and replaced by the
             // Alpha uniform scaled by 255, then the four values are packed to 16 bit and duplicated into both 64-bit halves
3470         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3471         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3472         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3473         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // Color_Diffuse gets the same treatment, except that lane 3 stays zero
3474         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3475         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3476         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3477         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3478         {
                     // glow variant: also sample the glow texture (TEXCOORD0) and prepare Color_Glow like Color_Diffuse
3479                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3480                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3481                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3482                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
                     // low 64 bits = ambient, high 64 bits = glow; used by the single-pixel path below
3483                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3484                 for (x = startx;x < endx;x++)
3485                 {
3486                         __m128i color, lightmap, glow, pix;
                             // fast path: the next four mask bytes are all 1, so shade four pixels at once;
                             // _mm_unpacklo/hi_epi8(zero, v) puts each byte into the high byte of a 16-bit lane and
                             // _mm_mulhi_epu16 keeps the high 16 bits of each product
3487                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3488                         {
3489                                 __m128i pix2;
3490                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3491                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3492                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3493                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3494                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3495                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3496                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3497                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3498                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3499                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                     // together with the loop's x++ this advances by four pixels
3500                                 x += 3;
3501                                 continue;
3502                         }
                             // single-pixel path: pixels with a zero mask byte are left untouched
3503                         if (!pixelmask[x])
3504                                 continue;
3505                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3506                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3507                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
                             // the upper 64 bits of lightmap are zero here, so the low half computes
                             // (diffuse*lightmap + ambient) * color while the high half computes glowcolor * glow
3508                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
                             // add the glow half onto the color half
3509                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3510                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3511                 }
3512         }
3513         else
3514         {
                     // variant without glow: same structure, only the diffuse/ambient lightmap term
3515                 for (x = startx;x < endx;x++)
3516                 {
3517                         __m128i color, lightmap, pix;
                             // fast path for four fully masked-in pixels
3518                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3519                         {
3520                                 __m128i pix2;
3521                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3522                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3523                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3524                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3525                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3526                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3527                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3528                                 x += 3;
3529                                 continue;
3530                         }
                             // single-pixel path
3531                         if (!pixelmask[x]) 
3532                                 continue;
3533                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3534                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3535                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3536                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3537                 }
3538         }
             // if the temporary buffer was used, let DPSOFTRAST_Draw_Span_FinishBGRA8 process it
             // (defined elsewhere; presumably applies alpha kill / blending and writes the framebuffer)
3539         if (pixel == buffer_FragColorbgra8)
3540                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3541 #endif
3542 }
3543
3544
// Forward declarations: the FakeLight and LightDirectionMap shader wrappers
// that follow call these two functions before their definitions appear
// further down in this file.
3545 void DPSOFTRAST_VertexShader_LightDirection(void);
3546 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3547
// FakeLight vertex shader: does no work of its own, it simply runs
// DPSOFTRAST_VertexShader_LightDirection().
3548 static void DPSOFTRAST_VertexShader_FakeLight(void)
3549 {
3550         DPSOFTRAST_VertexShader_LightDirection();
3551 }
3552
// FakeLight pixel shader: forwards its arguments unchanged to
// DPSOFTRAST_PixelShader_LightDirection(), which checks thread->shader_mode
// for SHADERMODE_FAKELIGHT to select the fake-light behavior.
3553 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3554 {
3555         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3556 }
3557
3558
3559
// Model-space LightDirectionMap vertex shader: runs the shared LightDirection
// vertex shader and additionally passes TEXCOORD4 to DPSOFTRAST_Array_Load
// (TEXCOORD4 is the coordinate the LightDirection pixel shader uses to sample
// the lightmap and deluxemap).
3560 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3561 {
3562         DPSOFTRAST_VertexShader_LightDirection();
3563         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3564 }
3565
// Model-space LightDirectionMap pixel shader: forwards its arguments unchanged
// to DPSOFTRAST_PixelShader_LightDirection(), which checks thread->shader_mode
// for SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE.
3566 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3567 {
3568         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3569 }
3570
3571
3572
// Tangent-space LightDirectionMap vertex shader: runs the shared LightDirection
// vertex shader and additionally passes TEXCOORD4 to DPSOFTRAST_Array_Load
// (TEXCOORD4 is the coordinate the LightDirection pixel shader uses to sample
// the lightmap and deluxemap).
3573 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3574 {
3575         DPSOFTRAST_VertexShader_LightDirection();
3576         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3577 }
3578
// Tangent-space LightDirectionMap pixel shader: forwards its arguments
// unchanged to DPSOFTRAST_PixelShader_LightDirection(), which checks
// thread->shader_mode for SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3579 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3580 {
3581         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3582 }
3583
3584
3585
3586 void DPSOFTRAST_VertexShader_LightDirection(void)
3587 {
3588         int i;
3589         int numvertices = dpsoftrast.numvertices;
3590         float LightDir[4];
3591         float LightVector[4];
3592         float EyePosition[4];
3593         float EyeVectorModelSpace[4];
3594         float EyeVector[4];
3595         float position[4];
3596         float svector[4];
3597         float tvector[4];
3598         float normal[4];
3599         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3600         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3601         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3602         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3603         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3604         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3605         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3606         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3607         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3608         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3609         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3610         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3611         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3612         for (i = 0;i < numvertices;i++)
3613         {
3614                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3615                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3616                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3617                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3618                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3619                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3620                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3621                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3622                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3623                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3624                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3625                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3626                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3627                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3628                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3629                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3630                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3631                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3632                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3633                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3634                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3635                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3636                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3637                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3638                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3639                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3640                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3641                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3642                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3643         }
3644         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3645 }
3646
3647 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3648 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3649 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3650 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3651 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3652 #define DPSOFTRAST_Vector3Normalize(v)\
3653 do\
3654 {\
3655         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3656         if (len)\
3657         {\
3658                 len = 1.0f / len;\
3659                 v[0] *= len;\
3660                 v[1] *= len;\
3661                 v[2] *= len;\
3662         }\
3663 }\
3664 while(0)
3665
3666 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3667 {
3668         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3669         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3670         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3671         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3672         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3673         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3674         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3675         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3676         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3677         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3678         int x, startx = span->startx, endx = span->endx;
3679         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3680         float LightVectordata[4];
3681         float LightVectorslope[4];
3682         float EyeVectordata[4];
3683         float EyeVectorslope[4];
3684         float VectorSdata[4];
3685         float VectorSslope[4];
3686         float VectorTdata[4];
3687         float VectorTslope[4];
3688         float VectorRdata[4];
3689         float VectorRslope[4];
3690         float z;
3691         float diffusetex[4];
3692         float glosstex[4];
3693         float surfacenormal[4];
3694         float lightnormal[4];
3695         float lightnormal_modelspace[4];
3696         float eyenormal[4];
3697         float specularnormal[4];
3698         float diffuse;
3699         float specular;
3700         float SpecularPower;
3701         int d[4];
3702         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3703         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3704         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3705         Color_Glow[3] = 0.0f;
3706         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3707         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3708         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3709         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3710         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3711         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3712         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3713         Color_Pants[3] = 0.0f;
3714         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3715         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3716         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3717         Color_Shirt[3] = 0.0f;
3718         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3719         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3720         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3721         {
3722                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3723                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3724         }
3725         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3726         {
3727                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3728         }
3729         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3730         {
3731                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3732                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3733                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3734                 Color_Diffuse[3] = 0.0f;
3735                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3736                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3737                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3738                 LightColor[3] = 0.0f;
3739                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3740                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3741                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3742                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3743                 Color_Specular[3] = 0.0f;
3744                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3745                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3746                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3747
3748                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3749                 {
3750                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3751                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3752                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3753                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3754                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3755                 }
3756                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3757                 {
3758                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3759                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3760                 }
3761                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3762                 {
3763                         // nothing of this needed
3764                 }
3765                 else
3766                 {
3767                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3768                 }
3769
3770                 for (x = startx;x < endx;x++)
3771                 {
3772                         z = buffer_z[x];
3773                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3774                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3775                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3776                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3777                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3778                         {
3779                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3780                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3781                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3782                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3783                         }
3784                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3785                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3786                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3787                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3788                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3789                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3790                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3791                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3792
3793                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3794                         {
3795                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3796                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3797                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3798                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3799
3800                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3801                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3802                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3803                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3804
3805                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3806                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3807                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3808                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3809
3810                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3811                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3812                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3813                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3814
3815                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3816                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3817
3818                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3819                                 {
3820                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3821                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3822                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3823                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3824                                 }
3825                         }
3826                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3827                         {
3828                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3829                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3830                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3831                                 {
3832                                         float f = 1.0f / 256.0f;
3833                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3834                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3835                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3836                                 }
3837                         }
3838                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3839                         {
3840                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3841                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3842                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3843                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3844
3845                                 LightColor[0] = 1.0;
3846                                 LightColor[1] = 1.0;
3847                                 LightColor[2] = 1.0;
3848                         }
3849                         else
3850                         {
3851                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3852                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3853                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3854                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3855                         }
3856
3857                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3858
3859                         if(thread->shader_exactspecularmath)
3860                         {
3861                                 // reflect lightnormal at surfacenormal, take the negative of that
3862                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3863                                 float f;
3864                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3865                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3866                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3867                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3868
3869                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3870                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3871                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3872                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3873                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3874
3875                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3876                         }
3877                         else
3878                         {
3879                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3880                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3881                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3882                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3883
3884                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3885                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3886                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3887                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3888
3889                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3890                         }
3891                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
3892
3893                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3894                         {
3895                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3896                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3897                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3898                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3899                         }
3900                         else
3901                         {
3902                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3903                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3904                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3905                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3906                         }
3907
3908                         buffer_FragColorbgra8[x*4+0] = d[0];
3909                         buffer_FragColorbgra8[x*4+1] = d[1];
3910                         buffer_FragColorbgra8[x*4+2] = d[2];
3911                         buffer_FragColorbgra8[x*4+3] = d[3];
3912                 }
3913         }
3914         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3915         {
3916                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3917                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3918                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3919                 Color_Diffuse[3] = 0.0f;
3920                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3921                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3922                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3923                 LightColor[3] = 0.0f;
3924                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3925
3926                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3927                 {
3928                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3929                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3930                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3931                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3932                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3933                 }
3934                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3935                 {
3936                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3937                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3938                 }
3939                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3940                 {
3941                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3942                 }
3943                 else
3944                 {
3945                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3946                 }
3947
3948                 for (x = startx;x < endx;x++)
3949                 {
3950                         z = buffer_z[x];
3951                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3952                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3953                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3954                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3955                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3956                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3957                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3958                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3959
3960                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3961                         {
3962                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3963                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3964                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3965                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3966
3967                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3968                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3969                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3970                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3971
3972                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3973                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3974                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3975                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3976
3977                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3978                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3979                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3980                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3981
3982                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3983                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3984
3985                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3986                                 {
3987                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3988                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3989                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3990                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3991                                 }
3992                         }
3993                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3994                         {
3995                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3996                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3997                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3998                                 {
3999                                         float f = 1.0f / 256.0f;
4000                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4001                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4002                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4003                                 }
4004                         }
4005                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4006                         {
4007                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4008                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4009                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4010                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4011
4012                                 LightColor[0] = 1.0;
4013                                 LightColor[1] = 1.0;
4014                                 LightColor[2] = 1.0;
4015                         }
4016                         else
4017                         {
4018                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4019                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4020                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4021                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4022                         }
4023
4024                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4025                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4026                         {
4027                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4028                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4029                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4030                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4031                         }
4032                         else
4033                         {
4034                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4035                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4036                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4037                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4038                         }
4039                         buffer_FragColorbgra8[x*4+0] = d[0];
4040                         buffer_FragColorbgra8[x*4+1] = d[1];
4041                         buffer_FragColorbgra8[x*4+2] = d[2];
4042                         buffer_FragColorbgra8[x*4+3] = d[3];
4043                 }
4044         }
4045         else
4046         {
4047                 for (x = startx;x < endx;x++)
4048                 {
4049                         z = buffer_z[x];
4050                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4051                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4052                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4053                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4054
4055                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4056                         {
4057                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4058                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4059                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4060                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4061                         }
4062                         else
4063                         {
4064                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4065                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4066                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4067                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4068                         }
4069                         buffer_FragColorbgra8[x*4+0] = d[0];
4070                         buffer_FragColorbgra8[x*4+1] = d[1];
4071                         buffer_FragColorbgra8[x*4+2] = d[2];
4072                         buffer_FragColorbgra8[x*4+3] = d[3];
4073                 }
4074         }
4075         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4076 }
4077
4078
4079
4080 static void DPSOFTRAST_VertexShader_LightSource(void)
4081 {
4082         int i;
4083         int numvertices = dpsoftrast.numvertices;
4084         float LightPosition[4];
4085         float LightVector[4];
4086         float LightVectorModelSpace[4];
4087         float EyePosition[4];
4088         float EyeVectorModelSpace[4];
4089         float EyeVector[4];
4090         float position[4];
4091         float svector[4];
4092         float tvector[4];
4093         float normal[4];
4094         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4095         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4096         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4097         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4098         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4099         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4100         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4101         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4102         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4103         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4104         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4105         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4106         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4107         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4108         for (i = 0;i < numvertices;i++)
4109         {
4110                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4111                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4112                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4113                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4114                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4115                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4116                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4117                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4118                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4119                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4120                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4121                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4122                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4123                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4124                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4125                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4126                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4127                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4128                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4129                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4130                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4131                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4132                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4133                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4134                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4135                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4136                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4137                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4138                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4139                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4140                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4141                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4142         }
4143         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4144         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4145 }
4146
4147 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4148 {
4149 #ifdef SSE_POSSIBLE
4150         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4151         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4152         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4153         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4154         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4155         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4156         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4157         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4158         int x, startx = span->startx, endx = span->endx;
4159         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4160         float CubeVectordata[4];
4161         float CubeVectorslope[4];
4162         float LightVectordata[4];
4163         float LightVectorslope[4];
4164         float EyeVectordata[4];
4165         float EyeVectorslope[4];
4166         float z;
4167         float diffusetex[4];
4168         float glosstex[4];
4169         float surfacenormal[4];
4170         float lightnormal[4];
4171         float eyenormal[4];
4172         float specularnormal[4];
4173         float diffuse;
4174         float specular;
4175         float SpecularPower;
4176         float CubeVector[4];
4177         float attenuation;
4178         int d[4];
4179         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4180         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4181         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4182         Color_Glow[3] = 0.0f;
4183         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4184         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4185         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4186         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4187         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4188         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4189         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4190         Color_Diffuse[3] = 0.0f;
4191         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4192         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4193         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4194         Color_Specular[3] = 0.0f;
4195         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4196         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4197         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4198         Color_Pants[3] = 0.0f;
4199         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4200         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4201         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4202         Color_Shirt[3] = 0.0f;
4203         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4204         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4205         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4206         LightColor[3] = 0.0f;
4207         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4208         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4209         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4210         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4211         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4212         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4213         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4214         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4215         {
4216                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4217                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4218         }
4219         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4220                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4221         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4222         {
4223                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4224                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4225                 for (x = startx;x < endx;x++)
4226                 {
4227                         z = buffer_z[x];
4228                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4229                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4230                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4231                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4232                         if (attenuation < 0.01f)
4233                                 continue;
4234                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4235                         {
4236                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4237                                 if (attenuation < 0.01f)
4238                                         continue;
4239                         }
4240
4241                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4242                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4243                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4244                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4245                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4246                         {
4247                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4248                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4249                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4250                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4251                         }
4252                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4253                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4254                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4255                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4256                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4257                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4258                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4259                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4260
4261                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4262                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4263                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4264                         DPSOFTRAST_Vector3Normalize(lightnormal);
4265
4266                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4267
4268                         if(thread->shader_exactspecularmath)
4269                         {
4270                                 // reflect lightnormal at surfacenormal, take the negative of that
4271                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4272                                 float f;
4273                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4274                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4275                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4276                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4277
4278                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4279                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4280                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4281                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4282                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4283
4284                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4285                         }
4286                         else
4287                         {
4288                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4289                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4290                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4291                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4292
4293                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4294                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4295                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4296                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4297
4298                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4299                         }
4300                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
4301
4302                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4303                         {
4304                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4305                                 attenuation *= (1.0f / 255.0f);
4306                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4307                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4308                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4309                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4310                         }
4311                         else
4312                         {
4313                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4314                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4315                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4316                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4317                         }
4318                         buffer_FragColorbgra8[x*4+0] = d[0];
4319                         buffer_FragColorbgra8[x*4+1] = d[1];
4320                         buffer_FragColorbgra8[x*4+2] = d[2];
4321                         buffer_FragColorbgra8[x*4+3] = d[3];
4322                 }
4323         }
4324         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4325         {
4326                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4327                 for (x = startx;x < endx;x++)
4328                 {
4329                         z = buffer_z[x];
4330                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4331                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4332                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4333                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4334                         if (attenuation < 0.01f)
4335                                 continue;
4336                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4337                         {
4338                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4339                                 if (attenuation < 0.01f)
4340                                         continue;
4341                         }
4342
4343                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4344                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4345                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4346                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4347                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4348                         {
4349                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4350                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4351                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4352                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4353                         }
4354                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4355                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4356                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4357                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4358
4359                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4360                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4361                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4362                         DPSOFTRAST_Vector3Normalize(lightnormal);
4363
4364                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4365                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4366                         {
4367                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4368                                 attenuation *= (1.0f / 255.0f);
4369                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4370                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4371                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4372                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4373                         }
4374                         else
4375                         {
4376                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4377                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4378                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4379                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4380                         }
4381                         buffer_FragColorbgra8[x*4+0] = d[0];
4382                         buffer_FragColorbgra8[x*4+1] = d[1];
4383                         buffer_FragColorbgra8[x*4+2] = d[2];
4384                         buffer_FragColorbgra8[x*4+3] = d[3];
4385                 }
4386         }
4387         else
4388         {
4389                 for (x = startx;x < endx;x++)
4390                 {
4391                         z = buffer_z[x];
4392                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4393                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4394                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4395                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4396                         if (attenuation < 0.01f)
4397                                 continue;
4398                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4399                         {
4400                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4401                                 if (attenuation < 0.01f)
4402                                         continue;
4403                         }
4404
4405                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4406                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4407                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4408                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4409                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4410                         {
4411                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4412                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4413                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4414                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4415                         }
4416                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4417                         {
4418                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4419                                 attenuation *= (1.0f / 255.0f);
4420                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4421                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4422                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4423                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4424                         }
4425                         else
4426                         {
4427                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4428                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4429                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4430                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4431                         }
4432                         buffer_FragColorbgra8[x*4+0] = d[0];
4433                         buffer_FragColorbgra8[x*4+1] = d[1];
4434                         buffer_FragColorbgra8[x*4+2] = d[2];
4435                         buffer_FragColorbgra8[x*4+3] = d[3];
4436                 }
4437         }
4438         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4439 #endif
4440 }
4441
4442
4443
4444 static void DPSOFTRAST_VertexShader_Refraction(void)
4445 {
4446         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4447         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4448         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4449 }
4450
4451 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4452 {
4453         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4454         float z;
4455         int x, startx = span->startx, endx = span->endx;
4456
4457         // texture reads
4458         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4459         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4460
4461         // varyings
4462         float ModelViewProjectionPositiondata[4];
4463         float ModelViewProjectionPositionslope[4];
4464
4465         // uniforms
4466         float ScreenScaleRefractReflect[2];
4467         float ScreenCenterRefractReflect[2];
4468         float DistortScaleRefractReflect[2];
4469         float RefractColor[4];
4470
4471         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4472         if(!texture) return;
4473
4474         // read textures
4475         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4476         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4477
4478         // read varyings
4479         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4480
4481         // read uniforms
4482         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4483         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4484         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4485         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4486         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4487         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4488         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4489         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4490         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4491         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4492
4493         // do stuff
4494         for (x = startx;x < endx;x++)
4495         {
4496                 float SafeScreenTexCoord[2];
4497                 float ScreenTexCoord[2];
4498                 float v[3];
4499                 float iw;
4500                 unsigned char c[4];
4501
4502                 z = buffer_z[x];
4503
4504                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4505                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4506
4507                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4508                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4509                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4510
4511                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4512                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4513                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4514                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4515                 DPSOFTRAST_Vector3Normalize(v);
4516                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4517                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4518
4519                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4520                 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4521
4522                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4523                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4524                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4525                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4526         }
4527
4528         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4529 }
4530
4531
4532
4533 static void DPSOFTRAST_VertexShader_Water(void)
4534 {
4535         int i;
4536         int numvertices = dpsoftrast.numvertices;
4537         float EyePosition[4];
4538         float EyeVectorModelSpace[4];
4539         float EyeVector[4];
4540         float position[4];
4541         float svector[4];
4542         float tvector[4];
4543         float normal[4];
4544         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4545         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4546         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4547         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4548         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4549         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4550         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4551         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4552         for (i = 0;i < numvertices;i++)
4553         {
4554                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4555                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4556                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4557                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4558                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4559                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4560                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4561                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4562                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4563                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4564                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4565                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4566                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4567                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4568                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4569                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4570                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4571                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4572                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4573                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4574                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4575                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4576         }
4577         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4578         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4579         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4580 }
4581
4582
4583 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4584 {
4585         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4586         float z;
4587         int x, startx = span->startx, endx = span->endx;
4588
4589         // texture reads
4590         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4591         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4592
4593         // varyings
4594         float ModelViewProjectionPositiondata[4];
4595         float ModelViewProjectionPositionslope[4];
4596         float EyeVectordata[4];
4597         float EyeVectorslope[4];
4598
4599         // uniforms
4600         float ScreenScaleRefractReflect[4];
4601         float ScreenCenterRefractReflect[4];
4602         float DistortScaleRefractReflect[4];
4603         float RefractColor[4];
4604         float ReflectColor[4];
4605         float ReflectFactor;
4606         float ReflectOffset;
4607
4608         DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4609         DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4610         if(!texture_refraction || !texture_reflection) return;
4611
4612         // read textures
4613         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4614         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4615
4616         // read varyings
4617         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4618         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4619
4620         // read uniforms
4621         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4622         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4623         ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4624         ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4625         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4626         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4627         ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4628         ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4629         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4630         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4631         DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4632         DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4633         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4634         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4635         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4636         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4637         ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4638         ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4639         ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4640         ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4641         ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4642         ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4643
4644         // do stuff
4645         for (x = startx;x < endx;x++)
4646         {
4647                 float SafeScreenTexCoord[4];
4648                 float ScreenTexCoord[4];
4649                 float v[3];
4650                 float iw;
4651                 unsigned char c1[4];
4652                 unsigned char c2[4];
4653                 float Fresnel;
4654
4655                 z = buffer_z[x];
4656
4657                 // "    vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4658                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4659
4660                 // "    vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4661                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4662                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4663                 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4664                 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4665
4666                 // "    vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4667                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4668                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4669                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4670                 DPSOFTRAST_Vector3Normalize(v);
4671                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4672                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4673                 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4674                 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4675
4676                 // "    float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4677                 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4678                 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4679                 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4680                 DPSOFTRAST_Vector3Normalize(v);
4681                 Fresnel = 1.0f - v[2];
4682                 Fresnel = min(1.0f, Fresnel);
4683                 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4684
4685                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4686                 // "    dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4687                 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4688                 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4689
4690                 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4691                 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4692                 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4693                 buffer_FragColorbgra8[x*4+3] = min((    RefractColor[3] *  (1.0f - Fresnel) +          ReflectColor[3]  * Fresnel) * 256, 255);
4694         }
4695
4696         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4697 }
4698
4699
4700
4701 static void DPSOFTRAST_VertexShader_ShowDepth(void)
4702 {
4703         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4704 }
4705
4706 static void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4707 {
4708         // TODO: IMPLEMENT
4709         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4710         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4711         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4712         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4713         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4714 }
4715
4716
4717
4718 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4719 {
4720         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4721 }
4722
4723 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4724 {
4725         // TODO: IMPLEMENT
4726         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4727         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4728         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4729         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4730         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4731 }
4732
4733
4734
4735 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4736 {
4737         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4738 }
4739
4740 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4741 {
4742         // TODO: IMPLEMENT
4743         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4744         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4745         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4746         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4747         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4748 }
4749
4750
4751
4752 typedef struct DPSOFTRAST_ShaderModeInfo_s
4753 {
4754         int lodarrayindex;
4755         void (*Vertex)(void);
4756         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4757         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4758         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4759 }
4760 DPSOFTRAST_ShaderModeInfo;
4761
4762 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4763 {
4764         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4765         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4766         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4767         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4768         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4769         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4770         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4771         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4772         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4773         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4774         {2, DPSOFTRAST_VertexShader_VertexColor,                        DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4775         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4776         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4777         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4778         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4779         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4780         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4781         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4782 };
4783
4784 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4785 {
4786         int x;
4787         int startx;
4788         int endx;
4789         unsigned int *depthpixel;
4790         int depth;
4791         int depthslope;
4792         unsigned int d;
4793         unsigned char *pixelmask;
4794         DPSOFTRAST_State_Triangle *triangle;
4795         triangle = &thread->triangles[span->triangle];
4796         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4797         startx = span->startx;
4798         endx = span->endx;
4799         depth = span->depthbase;
4800         depthslope = span->depthslope;
4801         pixelmask = thread->pixelmaskarray;
4802         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4803         {
4804                 switch(thread->fb_depthfunc)
4805                 {
4806                 default:
4807                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4808                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4809                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4810                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4811                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4812                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4813                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4814                 }
4815                 while (startx < endx && !pixelmask[startx])
4816                         startx++;
4817                 while (endx > startx && !pixelmask[endx-1])
4818                         endx--;
4819         }
4820         else
4821         {
4822                 // no depth testing means we're just dealing with color...
4823                 memset(pixelmask + startx, 1, endx - startx);
4824         }
4825         span->pixelmask = pixelmask;
4826         span->startx = startx;
4827         span->endx = endx;
4828 }
4829
4830 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4831 {
4832         int x, d, depth, depthslope, startx, endx;
4833         const unsigned char *pixelmask;
4834         unsigned int *depthpixel;
4835         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4836         {
4837                 depth = span->depthbase;
4838                 depthslope = span->depthslope;
4839                 pixelmask = span->pixelmask;
4840                 startx = span->startx;
4841                 endx = span->endx;
4842                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4843                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4844                         if (pixelmask[x])
4845                                 depthpixel[x] = d;
4846         }
4847 }
4848
4849 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4850 {
4851         int i;
4852         DPSOFTRAST_State_Triangle *triangle;
4853         DPSOFTRAST_State_Span *span;
4854         for (i = 0; i < thread->numspans; i++)
4855         {
4856                 span = &thread->spans[i];
4857                 triangle = &thread->triangles[span->triangle];
4858                 DPSOFTRAST_Draw_DepthTest(thread, span);
4859                 if (span->startx >= span->endx)
4860                         continue;
4861                 // run pixel shader if appropriate
4862                 // do this before running depthmask code, to allow the pixelshader
4863                 // to clear pixelmask values for alpha testing
4864                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4865                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4866                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4867         }
4868         thread->numspans = 0;
4869 }
4870
4871 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;)
4872
4873 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4874 {
4875 #ifdef SSE_POSSIBLE
4876         int cullface = thread->cullface;
4877         int minx, maxx, miny, maxy;
4878         int miny1, maxy1, miny2, maxy2;
4879         __m128i fbmin, fbmax;
4880         __m128 viewportcenter, viewportscale;
4881         int firstvertex = command->firstvertex;
4882         int numvertices = command->numvertices;
4883         int numtriangles = command->numtriangles;
4884         const int *element3i = command->element3i;
4885         const unsigned short *element3s = command->element3s;
4886         int clipped = command->clipped;
4887         int i;
4888         int j;
4889         int k;
4890         int y;
4891         int e[3];
4892         __m128i screeny;
4893         int starty, endy, bandy;
4894         int numpoints;
4895         int clipcase;
4896         float clipdist[4];
4897         float clip0origin, clip0slope;
4898         int clip0dir;
4899         __m128 triangleedge1, triangleedge2, trianglenormal;
4900         __m128 clipfrac[3];
4901         __m128 screen[4];
4902         DPSOFTRAST_State_Triangle *triangle;
4903         DPSOFTRAST_Texture *texture;
4904         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4905         miny = thread->fb_scissor[1];
4906         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4907         miny1 = bound(miny, thread->miny1, maxy);
4908         maxy1 = bound(miny, thread->maxy1, maxy);
4909         miny2 = bound(miny, thread->miny2, maxy);
4910         maxy2 = bound(miny, thread->maxy2, maxy);
4911         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4912         {
4913                 if (!ATOMIC_DECREMENT(command->refcount))
4914                 {
4915                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4916                                 MM_FREE(command->arrays);
4917                 }
4918                 return;
4919         }
4920         minx = thread->fb_scissor[0];
4921         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4922         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4923         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4924         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4925         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4926         screen[3] = _mm_setzero_ps();
4927         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4928         for (i = 0;i < numtriangles;i++)
4929         {
4930                 const float *screencoord4f = command->arrays;
4931                 const float *arrays = screencoord4f + numvertices*4;
4932
4933                 // generate the 3 edges of this triangle
4934                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4935                 if (element3s)
4936                 {
4937                         e[0] = element3s[i*3+0] - firstvertex;
4938                         e[1] = element3s[i*3+1] - firstvertex;
4939                         e[2] = element3s[i*3+2] - firstvertex;
4940                 }
4941                 else if (element3i)
4942                 {
4943                         e[0] = element3i[i*3+0] - firstvertex;
4944                         e[1] = element3i[i*3+1] - firstvertex;
4945                         e[2] = element3i[i*3+2] - firstvertex;
4946                 }
4947                 else
4948                 {
4949                         e[0] = i*3+0;
4950                         e[1] = i*3+1;
4951                         e[2] = i*3+2;
4952                 }
4953
4954 #define SKIPBACKFACE \
4955                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4956                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4957                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4958                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4959                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4960                 switch(cullface) \
4961                 { \
4962                 case GL_BACK: \
4963                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4964                                 continue; \
4965                         break; \
4966                 case GL_FRONT: \
4967                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4968                                 continue; \
4969                         break; \
4970                 }
4971
4972 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4973                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4974                         { \
4975                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4976                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4977                         }
4978 #define CLIPPEDVERTEXCOPY(k,p1) \
4979                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4980
4981 #define GENATTRIBCOPY(attrib, p1) \
4982                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4983 #define GENATTRIBLERP(attrib, p1, p2) \
4984                 { \
4985                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4986                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4987                 }
4988 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4989                 switch(clipcase) \
4990                 { \
4991                 default: \
4992                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4993                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4994                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4995                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4996                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4997                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4998                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4999                 }
5000
5001                 if (! clipped)
5002                         goto notclipped;
5003
5004                 // calculate distance from nearplane
5005                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
5006                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
5007                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
5008                 if (clipdist[0] >= 0.0f)
5009                 {
5010                         if (clipdist[1] >= 0.0f)
5011                         {
5012                                 if (clipdist[2] >= 0.0f)
5013                                 {
5014                                 notclipped:
5015                                         // triangle is entirely in front of nearplane
5016                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5017                                         SKIPBACKFACE;
5018                                         numpoints = 3;
5019                                         clipcase = 0;
5020                                 }
5021                                 else
5022                                 {
5023                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5024                                         SKIPBACKFACE;
5025                                         numpoints = 4;
5026                                         clipcase = 1;
5027                                 }
5028                         }
5029                         else
5030                         {
5031                                 if (clipdist[2] >= 0.0f)
5032                                 {
5033                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5034                                         SKIPBACKFACE;
5035                                         numpoints = 4;
5036                                         clipcase = 2;
5037                                 }
5038                                 else
5039                                 {
5040                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5041                                         SKIPBACKFACE;
5042                                         numpoints = 3;
5043                                         clipcase = 3;
5044                                 }
5045                         }
5046                 }
5047                 else if (clipdist[1] >= 0.0f)
5048                 {
5049                         if (clipdist[2] >= 0.0f)
5050                         {
5051                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5052                                 SKIPBACKFACE;
5053                                 numpoints = 4;
5054                                 clipcase = 4;
5055                         }
5056                         else
5057                         {
5058                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5059                                 SKIPBACKFACE;
5060                                 numpoints = 3;
5061                                 clipcase = 5;
5062                         }
5063                 }
5064                 else if (clipdist[2] >= 0.0f)
5065                 {
5066                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5067                         SKIPBACKFACE;
5068                         numpoints = 3;
5069                         clipcase = 6;
5070                 }
5071                 else continue; // triangle is entirely behind nearplane
5072
5073                 {
5074                         // calculate integer y coords for triangle points
5075                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5076                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5077                                         screenmin = _mm_min_epi16(screeni, screenir),
5078                                         screenmax = _mm_max_epi16(screeni, screenir);
5079                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5080                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5081                         screenmin = _mm_max_epi16(screenmin, fbmin);
5082                         screenmax = _mm_min_epi16(screenmax, fbmax);
5083                         // skip offscreen triangles
5084                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5085                                 continue;
5086                         starty = _mm_extract_epi16(screenmin, 1);
5087                         endy = _mm_extract_epi16(screenmax, 1)+1;
5088                         if (starty >= maxy1 && endy <= miny2)
5089                                 continue;
5090                         screeny = _mm_srai_epi32(screeni, 16);
5091                 }
5092
5093                 triangle = &thread->triangles[thread->numtriangles];
5094
5095                 // calculate attribute plans for triangle data...
5096                 // okay, this triangle is going to produce spans, we'd better project
5097                 // the interpolants now (this is what gives perspective texturing),
5098                 // this consists of simply multiplying all arrays by the W coord
5099                 // (which is basically 1/Z), which will be undone per-pixel
5100                 // (multiplying by Z again) to get the perspective-correct array
5101                 // values
5102                 {
5103                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5104                         __m128 mipedgescale, mipdensity;
5105                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5106                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5107                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5108                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5109                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5110                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5111                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5112                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5113                         attribedge1 = _mm_sub_ss(w0, w1);
5114                         attribedge2 = _mm_sub_ss(w2, w1);
5115                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5116                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5117                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5118                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5119                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5120                         _mm_store_ss(&triangle->w[0], attribxslope);
5121                         _mm_store_ss(&triangle->w[1], attribyslope);
5122                         _mm_store_ss(&triangle->w[2], attriborigin);
5123                         
5124                         clip0origin = 0;
5125                         clip0slope = 0;
5126                         clip0dir = 0;
5127                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5128                         {
5129                                 float cliporigin, clipxslope, clipyslope;
5130                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5131                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5132                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5133                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5134                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5135                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5136                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5137                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5138                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5139                                 if(clipxslope != 0)
5140                                 {
5141                                         clip0origin = -cliporigin/clipxslope;
5142                                         clip0slope = -clipyslope/clipxslope;
5143                                         clip0dir = clipxslope > 0 ? 1 : -1;
5144                                 }
5145                                 else if(clipyslope > 0)
5146                                 {
5147                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5148                                         clip0slope = dpsoftrast.fb_width;
5149                                         clip0dir = -1;
5150                                 }
5151                                 else if(clipyslope < 0)
5152                                 {
5153                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5154                                         clip0slope = -dpsoftrast.fb_width;
5155                                         clip0dir = -1;
5156                                 }
5157                                 else if(clip0origin < 0) continue;
5158                         }
5159
5160                         mipedgescale = _mm_setzero_ps();
5161                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5162                         {
5163                                 __m128 attrib0, attrib1, attrib2;
5164                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5165                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5166                                         break;
5167                                 arrays += numvertices*4;
5168                                 GENATTRIBS(attrib0, attrib1, attrib2);
5169                                 attriborigin = _mm_mul_ps(attrib1, w1);
5170                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5171                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5172                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5173                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5174                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5175                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5176                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5177                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5178                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5179                                 {
5180                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5181                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5182                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5183                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5184                                 }
5185                         }
5186
5187                         memset(triangle->mip, 0, sizeof(triangle->mip));
5188                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5189                         {
5190                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5191                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5192                                         break;
5193                                 texture = thread->texbound[texunit];
5194                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5195                                 {
5196                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5197                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5198                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5199                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5200                                         // this will be multiplied in the texturing routine by the texture resolution
5201                                         y = _mm_cvtss_si32(mipdensity);
5202                                         if (y > 0)
5203                                         {
5204                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5205                                                 if (y > texture->mipmaps - 1)
5206                                                         y = texture->mipmaps - 1;
5207                                                 triangle->mip[texunit] = y;
5208                                         }
5209                                 }
5210                         }
5211                 }
5212         
5213                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5214                 for (; y < bandy;)
5215                 {
5216                         __m128 xcoords, xslope;
5217                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5218                         int yccmask = _mm_movemask_epi8(ycc);
5219                         int edge0p, edge0n, edge1p, edge1n;
5220                         int nexty;
5221                         float w, wslope;
5222                         float clip0;
5223                         if (numpoints == 4)
5224                         {
5225                                 switch(yccmask)
5226                                 {
5227                                 default:
5228                                 case 0xFFFF: /*0000*/ y = endy; continue;
5229                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5230                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5231                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5232                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5233                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5234                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5235                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5236                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5237                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5238                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5239                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5240                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5241                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5242                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5243                                 case 0x0000: /*1111*/ y++; continue;
5244                                 }
5245                         }
5246                         else
5247                         {
5248                                 switch(yccmask)
5249                                 {
5250                                 default:
5251                                 case 0xFFFF: /*000*/ y = endy; continue;
5252                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5253                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5254                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5255                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5256                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5257                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5258                                 case 0x0000: /*111*/ y++; continue;
5259                                 }
5260                         }
5261                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5262                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5263                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5264                         nexty = _mm_extract_epi16(ycc, 0);
5265                         if (nexty >= bandy) nexty = bandy-1;
5266                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5267                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5268                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5269                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5270                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5271                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5272                         {
5273                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5274                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5275                         }
5276                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5277                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5278                         {
5279                                 int startx, endx, offset;
5280                                 startx = _mm_cvtss_si32(xcoords);
5281                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5282                                 if (startx < minx) startx = minx;
5283                                 if (endx > maxx) endx = maxx;
5284                                 if (startx >= endx) continue;
5285
5286                                 if (clip0dir)
5287                                 {
5288                                         if (clip0dir > 0)
5289                                         {
5290                                                 if (startx < clip0) 
5291                                                 {
5292                                                         if(endx <= clip0) continue;
5293                                                         startx = (int)clip0;
5294                                                 }
5295                                         }
5296                                         else if (endx > clip0) 
5297                                         {
5298                                                 if(startx >= clip0) continue;
5299                                                 endx = (int)clip0;
5300                                         }
5301                                 }
5302                                                 
5303                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5304                                 {
5305                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5306                                         span->triangle = thread->numtriangles;
5307                                         span->x = offset;
5308                                         span->y = y;
5309                                         span->startx = 0;
5310                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5311                                         if (span->startx >= span->endx)
5312                                                 continue;
5313                                         wslope = triangle->w[0];
5314                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5315                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5316                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5317                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5318                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5319                                 }
5320                         }
5321                 }
5322
5323                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5324                 {
5325                         DPSOFTRAST_Draw_ProcessSpans(thread);
5326                         thread->numtriangles = 0;
5327                 }
5328         }
5329
5330         if (!ATOMIC_DECREMENT(command->refcount))
5331         {
5332                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5333                         MM_FREE(command->arrays);
5334         }
5335
5336         if (thread->numspans > 0 || thread->numtriangles > 0)
5337         {
5338                 DPSOFTRAST_Draw_ProcessSpans(thread);
5339                 thread->numtriangles = 0;
5340         }
5341 #endif
5342 }
5343
5344 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5345 {
5346         int i;
5347         int j;
5348         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5349         int datasize = 2*numvertices*sizeof(float[4]);
5350         DPSOFTRAST_Command_Draw *command;
5351         unsigned char *data;
5352         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5353         {
5354                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5355                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5356                         break;
5357                 datasize += numvertices*sizeof(float[4]);
5358         }
5359         if (element3s)
5360                 datasize += numtriangles*sizeof(unsigned short[3]);
5361         else if (element3i)
5362                 datasize += numtriangles*sizeof(int[3]);
5363         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5364         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5365         {
5366                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5367                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5368         }
5369         else
5370         {
5371                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5372                 data = (unsigned char *)command + commandsize;
5373         }
5374         command->firstvertex = firstvertex;
5375         command->numvertices = numvertices;
5376         command->numtriangles = numtriangles;
5377         command->arrays = (float *)data;
5378         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5379         dpsoftrast.firstvertex = firstvertex;
5380         dpsoftrast.numvertices = numvertices;
5381         dpsoftrast.screencoord4f = (float *)data;
5382         data += numvertices*sizeof(float[4]);
5383         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5384         data += numvertices*sizeof(float[4]);
5385         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5386         {
5387                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5388                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5389                         break;
5390                 dpsoftrast.post_array4f[j] = (float *)data;
5391                 data += numvertices*sizeof(float[4]);
5392         }
5393         command->element3i = NULL;
5394         command->element3s = NULL;
5395         if (element3s)
5396         {
5397                 command->element3s = (unsigned short *)data;
5398                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5399         }
5400         else if (element3i)
5401         {
5402                 command->element3i = (int *)data;
5403                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5404         }
5405         return command;
5406 }
5407
5408 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5409 {
5410         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5411         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5412         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5413         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5414         if (command->starty >= command->endy)
5415         {
5416                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5417                         MM_FREE(command->arrays);
5418                 DPSOFTRAST_UndoCommand(command->commandsize);
5419                 return;
5420         }
5421         command->clipped = dpsoftrast.drawclipped;
5422         command->refcount = dpsoftrast.numthreads;
5423
5424         if (dpsoftrast.usethreads)
5425         {
5426                 int i;
5427                 DPSOFTRAST_Draw_SyncCommands();
5428                 for (i = 0; i < dpsoftrast.numthreads; i++)
5429                 {
5430                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5431                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5432                                 Thread_CondSignal(thread->drawcond);
5433                 }
5434         }
5435         else
5436         {
5437                 DPSOFTRAST_Draw_FlushThreads();
5438         }
5439 }
5440
5441 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
5442 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5443 {
5444         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5445 }
5446 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5447 {
5448         DPSOFTRAST_Command_SetRenderTargets *command;
5449         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5450                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5451                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5452                 DPSOFTRAST_Flush();
5453         dpsoftrast.fb_width = width;
5454         dpsoftrast.fb_height = height;
5455         dpsoftrast.fb_depthpixels = depthpixels;
5456         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5457         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5458         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5459         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5460         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5461         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5462         command->width = width;
5463         command->height = height;
5464 }
5465  
// Executes the queued commands for one thread state, starting at
// thread->commandoffset and stopping once the offset equals endoffset.
// Offsets index into dpsoftrast.commandpool.commands; the final offset is
// written back to thread->commandoffset.
// NOTE(review): the switch has no default case, so an opcode that is not
// listed below leaves commandoffset unchanged and the loop would not end.
5466 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5467 {
5468         int commandoffset = thread->commandoffset;
5469         while (commandoffset != endoffset)
5470         {
5471                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5472                 switch (command->opcode)
5473                 {
                     // INTERPCOMMAND(name) expands to one case label: it calls
                     // DPSOFTRAST_Interpret_<name>, advances by the aligned size of
                     // DPSOFTRAST_Command_<name>, and wraps the offset to 0 once it
                     // reaches DPSOFTRAST_DRAW_MAXCOMMANDPOOL.
                     // The macro is not #undef'd after the function.
5474 #define INTERPCOMMAND(name) \
5475                 case DPSOFTRAST_OPCODE_##name : \
5476                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5477                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5478                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5479                                 commandoffset = 0; \
5480                         break;
5481                 INTERPCOMMAND(Viewport)
5482                 INTERPCOMMAND(ClearColor)
5483                 INTERPCOMMAND(ClearDepth)
5484                 INTERPCOMMAND(ColorMask)
5485                 INTERPCOMMAND(DepthTest)
5486                 INTERPCOMMAND(ScissorTest)
5487                 INTERPCOMMAND(Scissor)
5488                 INTERPCOMMAND(BlendFunc)
5489                 INTERPCOMMAND(BlendSubtract)
5490                 INTERPCOMMAND(DepthMask)
5491                 INTERPCOMMAND(DepthFunc)
5492                 INTERPCOMMAND(DepthRange)
5493                 INTERPCOMMAND(PolygonOffset)
5494                 INTERPCOMMAND(CullFace)
5495                 INTERPCOMMAND(SetTexture)
5496                 INTERPCOMMAND(SetShader)
5497                 INTERPCOMMAND(Uniform4f)
5498                 INTERPCOMMAND(UniformMatrix4f)
5499                 INTERPCOMMAND(Uniform1i)
5500                 INTERPCOMMAND(SetRenderTargets)
5501                 INTERPCOMMAND(ClipPlane)
5502
                     // Draw commands advance by the size stored in the command itself
                     // rather than by sizeof, since a draw command may carry its vertex
                     // data inline.  thread->commandoffset is also stored right after
                     // each draw (presumably so progress is visible to code that reads
                     // it while this loop is still running - confirm).
5503                 case DPSOFTRAST_OPCODE_Draw:
5504                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5505                         commandoffset += command->commandsize;
5506                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5507                                 commandoffset = 0;
5508                         thread->commandoffset = commandoffset;
5509                         break;
5510
                     // Reset jumps back to the start of the command pool.
5511                 case DPSOFTRAST_OPCODE_Reset:
5512                         commandoffset = 0;
5513                         break;
5514                 }
5515         }
5516         thread->commandoffset = commandoffset;
5517 }
5518
5519 static int DPSOFTRAST_Draw_Thread(void *data)
5520 {
5521         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5522         while(thread->index >= 0)
5523         {
5524                 if (thread->commandoffset != dpsoftrast.drawcommand)
5525                 {
5526                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5527                 }
5528                 else 
5529                 {
5530                         Thread_LockMutex(thread->drawmutex);
5531                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5532                         {
5533                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5534                                 thread->starving = true;
5535                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5536                                 thread->starving = false;
5537                         }
5538                         Thread_UnlockMutex(thread->drawmutex);
5539                 }
5540         }   
5541         return 0;
5542 }
5543
5544 static void DPSOFTRAST_Draw_FlushThreads(void)
5545 {
5546         DPSOFTRAST_State_Thread *thread;
5547         int i;
5548         DPSOFTRAST_Draw_SyncCommands();
5549         if (dpsoftrast.usethreads) 
5550         {
5551                 for (i = 0; i < dpsoftrast.numthreads; i++)
5552                 {
5553                         thread = &dpsoftrast.threads[i];
5554                         if (thread->commandoffset != dpsoftrast.drawcommand)
5555                         {
5556                                 Thread_LockMutex(thread->drawmutex);
5557                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5558                                         Thread_CondSignal(thread->drawcond);
5559                                 Thread_UnlockMutex(thread->drawmutex);
5560                         }
5561                 }
5562                 for (i = 0; i < dpsoftrast.numthreads; i++)
5563                 {
5564                         thread = &dpsoftrast.threads[i];
5565                         if (thread->commandoffset != dpsoftrast.drawcommand)
5566                         {
5567                                 Thread_LockMutex(thread->drawmutex);
5568                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5569                                 {
5570                                         thread->waiting = true;
5571                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5572                                         thread->waiting = false;
5573                                 }
5574                                 Thread_UnlockMutex(thread->drawmutex);
5575                         }
5576                 }
5577         }
5578         else
5579         {
5580                 for (i = 0; i < dpsoftrast.numthreads; i++)
5581                 {
5582                         thread = &dpsoftrast.threads[i];
5583                         if (thread->commandoffset != dpsoftrast.drawcommand)
5584                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5585                 }
5586         }
5587         dpsoftrast.commandpool.usedcommands = 0;
5588 }
5589
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads and
// returns once that call has completed.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
	return;
}
5594
// Public finish entry point.  Performs exactly the same work as
// DPSOFTRAST_Flush, i.e. a call to DPSOFTRAST_Draw_FlushThreads.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5599
5600 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5601 {
5602         int i;
5603         union
5604         {
5605                 int i;
5606                 unsigned char b[4];
5607         }
5608         u;
5609         u.i = 1;
5610         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5611         dpsoftrast.bigendian = u.b[3];
5612         dpsoftrast.fb_width = width;
5613         dpsoftrast.fb_height = height;
5614         dpsoftrast.fb_depthpixels = depthpixels;
5615         dpsoftrast.fb_colorpixels[0] = colorpixels;
5616         dpsoftrast.fb_colorpixels[1] = NULL;
5617         dpsoftrast.fb_colorpixels[1] = NULL;
5618         dpsoftrast.fb_colorpixels[1] = NULL;
5619         dpsoftrast.viewport[0] = 0;
5620         dpsoftrast.viewport[1] = 0;
5621         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5622         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5623         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5624         dpsoftrast.texture_firstfree = 1;
5625         dpsoftrast.texture_end = 1;
5626         dpsoftrast.texture_max = 0;
5627         dpsoftrast.color[0] = 1;
5628         dpsoftrast.color[1] = 1;
5629         dpsoftrast.color[2] = 1;
5630         dpsoftrast.color[3] = 1;
5631         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5632         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5633         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5634         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5635         for (i = 0; i < dpsoftrast.numthreads; i++)
5636         {
5637                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5638                 thread->index = i;
5639                 thread->cullface = GL_BACK;
5640         thread->colormask[0] = 1; 
5641                 thread->colormask[1] = 1;
5642                 thread->colormask[2] = 1;
5643                 thread->colormask[3] = 1;
5644                 thread->blendfunc[0] = GL_ONE;
5645                 thread->blendfunc[1] = GL_ZERO;
5646                 thread->depthmask = true;
5647                 thread->depthtest = true;
5648                 thread->depthfunc = GL_LEQUAL;
5649                 thread->scissortest = false;
5650                 thread->viewport[0] = 0;
5651                 thread->viewport[1] = 0;
5652                 thread->viewport[2] = dpsoftrast.fb_width;
5653                 thread->viewport[3] = dpsoftrast.fb_height;
5654                 thread->scissor[0] = 0;
5655                 thread->scissor[1] = 0;
5656                 thread->scissor[2] = dpsoftrast.fb_width;
5657                 thread->scissor[3] = dpsoftrast.fb_height;
5658                 thread->depthrange[0] = 0;
5659                 thread->depthrange[1] = 1;
5660                 thread->polygonoffset[0] = 0;
5661                 thread->polygonoffset[1] = 0;
5662                 thread->clipplane[0] = 0;
5663                 thread->clipplane[1] = 0;
5664                 thread->clipplane[2] = 0;
5665                 thread->clipplane[3] = 1;
5666         
5667                 thread->numspans = 0;
5668                 thread->numtriangles = 0;
5669                 thread->commandoffset = 0;
5670                 thread->waiting = false;
5671                 thread->starving = false;
5672            
5673                 thread->validate = -1;
5674                 DPSOFTRAST_Validate(thread, -1);
5675  
5676                 if (dpsoftrast.usethreads)
5677                 {
5678                         thread->waitcond = Thread_CreateCond();
5679                         thread->drawcond = Thread_CreateCond();
5680                         thread->drawmutex = Thread_CreateMutex();
5681                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5682                 }
5683         }
5684         return 0;
5685 }
5686
5687 void DPSOFTRAST_Shutdown(void)
5688 {
5689         int i;
5690         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5691         {
5692                 DPSOFTRAST_State_Thread *thread;
5693                 for (i = 0; i < dpsoftrast.numthreads; i++)
5694                 {
5695                         thread = &dpsoftrast.threads[i];
5696                         Thread_LockMutex(thread->drawmutex);
5697                         thread->index = -1;
5698                         Thread_CondSignal(thread->drawcond);
5699                         Thread_UnlockMutex(thread->drawmutex);
5700                         Thread_WaitThread(thread->thread, 0);
5701                         Thread_DestroyCond(thread->waitcond);
5702                         Thread_DestroyCond(thread->drawcond);
5703                         Thread_DestroyMutex(thread->drawmutex);
5704                 }
5705         }
5706         for (i = 0;i < dpsoftrast.texture_end;i++)
5707                 if (dpsoftrast.texture[i].bytes)
5708                         MM_FREE(dpsoftrast.texture[i].bytes);
5709         if (dpsoftrast.texture)
5710                 free(dpsoftrast.texture);
5711         if (dpsoftrast.threads)
5712                 MM_FREE(dpsoftrast.threads);
5713         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5714 }
5715