]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
754cf685ddfe84d3a02d7f63a5dfa44bf5350b20
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
// Byte alignment passed to _mm_malloc by the MM_* allocation helpers.
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 4

// Platform layer, only selected when SSE_POSSIBLE is defined.  Each branch provides:
//   ALIGN(var)                - declares var with 16 byte alignment
//   ATOMIC(var)               - declares var with 4 byte alignment
//   MEMORY_BARRIER            - expands to _mm_sfence()
//   ATOMIC_COUNTER            - the integer type used for atomically updated counters
//   ATOMIC_INCREMENT/DECREMENT/ADD - atomic read-modify-write operations on such a counter
#ifdef SSE_POSSIBLE
        #if defined(__APPLE__)
                // Apple: OSAtomic*Barrier primitives
                #include <libkern/OSAtomic.h>
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(4)))
                #define MEMORY_BARRIER (_mm_sfence())
                #define ATOMIC_COUNTER volatile int32_t 
                #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
                #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
        #elif defined(__GNUC__) && defined(WIN32)
                // GCC targeting Windows (mingw): Interlocked* primitives
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(4)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile LONG
                // this LONG * cast serves to fix an issue with broken mingw
                // packages on Ubuntu; these only declare the function to take
                // a LONG *, causing a compile error here. This seems to be
                // error- and warn-free on platforms that DO declare
                // InterlockedIncrement correctly, like mingw on Windows.
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
                #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
        #elif defined(__GNUC__)
                // other GCC-compatible compilers: __sync builtins
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(4)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile int
                #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
                #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
                #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
        #elif defined(_MSC_VER)
                // MSVC: __declspec alignment and Interlocked* primitives
                #define ALIGN(var) __declspec(align(16)) var
                #define ATOMIC(var) __declspec(align(4)) var
                #define MEMORY_BARRIER (_mm_sfence())
                //(MemoryBarrier())
                #define ATOMIC_COUNTER volatile LONG
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
        #endif
#endif

// Fallbacks for anything the platform section above did not define:
// no extra alignment, no barrier, and plain (non-atomic) integer operations.
#ifndef ALIGN
#define ALIGN(var) var
#endif
#ifndef ATOMIC
#define ATOMIC(var) var
#endif
#ifndef MEMORY_BARRIER
#define MEMORY_BARRIER ((void)0)
#endif
#ifndef ATOMIC_COUNTER
#define ATOMIC_COUNTER int
#endif
#ifndef ATOMIC_INCREMENT
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#ifndef ATOMIC_DECREMENT
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#ifndef ATOMIC_ADD
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
86
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// Supply _mm_cvtss_f32 through the GCC builtin on GCC releases older than 4.6
// (clang is excluded).  The major version must be compared as well as the minor
// one, otherwise e.g. GCC 5.0 would be treated as "older than 4.6".
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
	#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif

// MM_MALLOC / MM_CALLOC / MM_FREE: allocation helpers.
// With SSE they return ALIGN_SIZE aligned memory from _mm_malloc (release with
// MM_FREE only); without SSE they map straight onto malloc / calloc / free.
#define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)

// Aligned equivalent of calloc: returns nmemb*size zeroed bytes, or NULL if the
// allocation fails or the byte count does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// reject requests whose byte count would wrap around
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ALIGN_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
109
// Indices of the per-vertex attribute arrays handled by the rasterizer:
// position, color, then eight texcoord sets.  DPSOFTRAST_ARRAY_TOTAL is the
// number of arrays and is used to size per-array tables.
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION  = 0,
	DPSOFTRAST_ARRAY_COLOR     = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL     = 10
}
DPSOFTRAST_ARRAY;
125
126 typedef struct DPSOFTRAST_Texture_s
127 {
128         int flags;
129         int width;
130         int height;
131         int depth;
132         int sides;
133         DPSOFTRAST_TEXTURE_FILTER filter;
134         int mipmaps;
135         int size;
136         ATOMIC_COUNTER binds;
137         unsigned char *bytes;
138         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
139 }
140 DPSOFTRAST_Texture;
141
// Commands are declared with the same alignment as ALIGN() data.
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Common header shared by every command: an opcode byte and the command's size.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode;
        unsigned short commandsize;
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> = opcodeval and the aligned struct
// DPSOFTRAST_Command_<name>, which starts with the common header
// (opcode, commandsize) followed by the given fields.
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );
162
// Size in bytes of the command pool buffer, and a per-command size limit.
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// Fixed-size, aligned byte buffer holding queued commands.
// NOTE(review): freecommand / usedcommands look like a write offset and a fill
// count for commands[] - confirm against the allocation code.
typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
{
        int freecommand;
        int usedcommands;
        ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
173
// Per-triangle interpolation setup.
// For each attribute array, attribs[array][0] is the per-x slope, attribs[array][1]
// the per-y slope and attribs[array][2] the base value, as consumed by the
// DPSOFTRAST_CALCATTRIB macros (value = base + x*slope_x + y*slope_y).
typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
{
        unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        float w[3];
        ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
181
// Evaluate one 4-component attribute at the start of a span.
//   triangle   - pointer to a struct with attribs[array][3][4] (x slope, y slope, base)
//   span       - pointer to a struct with x and y members
//   data       - receives base + x*slope_x + y*slope_y
//   slope      - receives the per-x slope
//   arrayindex - which attribute array to evaluate
// SSE variant: data and slope are __m128 lvalues, attribs must be 16 byte aligned.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar variant: data and slope are float[4] arrays.  Every macro argument is
// parenthesized so that expressions such as &localspan expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
198                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels produced for a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle; // triangle this span was generated by
        int x; // framebuffer x coord
        int y; // framebuffer y coord
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
        int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
        int depthslope; // depthbuffer value pixel delta
}
DPSOFTRAST_State_Span);
213
// Capacities of the per-thread span and triangle arrays and of the pixel mask buffer.
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// Bits of DPSOFTRAST_State_Thread::validate; a set bit means the matching
// derived state must be recomputed by DPSOFTRAST_Validate.
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
222
// Blend modes implemented by the rasterizer.  DPSOFTRAST_RecalcBlendFunc maps
// the current blendfunc pair (and the subtract flag) onto one of these.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
	DPSOFTRAST_BLENDMODE_ALPHA       = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
	DPSOFTRAST_BLENDMODE_ADD         = 3,
	DPSOFTRAST_BLENDMODE_INVMOD      = 4,
	DPSOFTRAST_BLENDMODE_MUL         = 5,
	DPSOFTRAST_BLENDMODE_MUL2        = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD      = 9,
	DPSOFTRAST_BLENDMODE_TOTAL       = 10
}
DPSOFTRAST_BLENDMODE;
238
// State owned by one rasterizer thread: its copy of the render state, the
// values derived from it, the screen bands it covers, and its span/triangle
// work buffers.
typedef ALIGN(struct DPSOFTRAST_State_Thread_s
{
        void *thread;
        int index; // position of this thread in dpsoftrast.threads; used to pick its bands

        // render state as set by the application
        int cullface;
        int colormask[4];
        int blendfunc[2]; // source and destination factors
        int blendsubtract;
        int depthmask;
        int depthtest;
        int depthfunc;
        int scissortest;
        int viewport[4];
        int scissor[4];
        float depthrange[2];
        float polygonoffset[2];
        float clipplane[4];
        ALIGN(float fb_clipplane[4]); // clipplane rescaled by DPSOFTRAST_RecalcClipPlane

        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];

        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // DPSOFTRAST_VALIDATE_ flags
        int validate;

        // derived values (DPSOFTRAST_VALIDATE_FB)
        int fb_colormask;
        int fb_scissor[4]; // x, y, width, height after clamping to the framebuffer
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
        int fb_depthfunc;

        // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
        int fb_blendmode;

        // band boundaries
        // (computed by DPSOFTRAST_RecalcThread; both bands are identical unless interlacing is on)
        int miny1;
        int maxy1;
        int miny2;
        int maxy2;

        ATOMIC(volatile int commandoffset);

        volatile bool waiting;
        volatile bool starving;
        void *waitcond;
        void *drawcond;
        void *drawmutex;

        int numspans;
        int numtriangles;
        DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
        DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
        unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
}
DPSOFTRAST_State_Thread);
304
// Global rasterizer state: framebuffer description, the application-side
// render state, vertex array pointers, the texture table, the worker thread
// array and the shared command pool.
typedef ALIGN(struct DPSOFTRAST_State_s
{
        // framebuffer
        int fb_width;
        int fb_height;
        unsigned int *fb_depthpixels;
        unsigned int *fb_colorpixels[4];

        int viewport[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        float color[4];
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // vertex array sources
        const float *pointer_vertex3f;
        const float *pointer_color4f;
        const unsigned char *pointer_color4ub;
        const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int stride_vertex;
        int stride_color;
        int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];

        int firstvertex;
        int numvertices;
        float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
        float *screencoord4f;
        int drawstarty;
        int drawendy;
        int drawclipped;

        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        // texture table: texture points at texture_max slots, slots at or beyond
        // texture_end are unused, texture_firstfree is where the search for a
        // free slot starts
        int texture_max;
        int texture_end;
        int texture_firstfree;
        DPSOFTRAST_Texture *texture;

        int bigendian;

        // error reporting
        const char *errorstring;

        bool usethreads;
        int interlace; // nonzero gives every thread two separate screen bands
        int numthreads;
        DPSOFTRAST_State_Thread *threads;

        ATOMIC(volatile int drawcommand);

        DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

// the single global rasterizer instance
DPSOFTRAST_State dpsoftrast;
364
// Depth buffer values are stored as integers scaled by DPSOFTRAST_DEPTHSCALE (2^30).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (0..1 maps to 0..255, rounded to nearest) into one
// int laid out as A<<24 | R<<16 | G<<8 | B.  The arguments are parenthesized so
// that expressions may be passed, and the shifts are done on unsigned values
// because shifting 255 into bit 31 of a signed int is undefined.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth buffer value DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
369
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
372
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
374 {
375         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377         fb_viewportcenter[3] = 0.5f;
378         fb_viewportcenter[0] = 0.0f;
379         fb_viewportscale[1] = 0.5f * viewport[2];
380         fb_viewportscale[2] = -0.5f * viewport[3];
381         fb_viewportscale[3] = 0.5f;
382         fb_viewportscale[0] = 1.0f;
383 }
384
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
386 {
387         if (dpsoftrast.interlace)
388         {
389                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
393         }
394         else
395         {
396                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
398         }
399 }
400
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
402 {
403         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
408 }
409
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
411 {
412         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413         // and viewport projection values
414         int x1, x2;
415         int y1, y2;
416         x1 = thread->scissor[0];
417         x2 = thread->scissor[0] + thread->scissor[2];
418         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419         y2 = dpsoftrast.fb_height - thread->scissor[1];
420         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
421         if (x1 < 0) x1 = 0;
422         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
423         if (y1 < 0) y1 = 0;
424         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
425         thread->fb_scissor[0] = x1;
426         thread->fb_scissor[1] = y1;
427         thread->fb_scissor[2] = x2 - x1;
428         thread->fb_scissor[3] = y2 - y1;
429
430         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431         DPSOFTRAST_RecalcClipPlane(thread);
432         DPSOFTRAST_RecalcThread(thread);
433 }
434
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
436 {
437         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
438 }
439
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
441 {
442         if (thread->blendsubtract)
443         {
444                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
445                 {
446                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
450                 }
451         }
452         else
453         {       
454                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
455                 {
456                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
461                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
467                 }
468         }
469 }
470
471 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
472
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
474 {
475         mask &= thread->validate;
476         if (!mask)
477                 return;
478         if (mask & DPSOFTRAST_VALIDATE_FB)
479         {
480                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481                 DPSOFTRAST_RecalcFB(thread);
482         }
483         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
484         {
485                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486                 DPSOFTRAST_RecalcDepthFunc(thread);
487         }
488         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
489         {
490                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491                 DPSOFTRAST_RecalcBlendFunc(thread);
492         }
493 }
494
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
496 {
497         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498                 return &dpsoftrast.texture[index];
499         return NULL;
500 }
501
502 static void DPSOFTRAST_Texture_Grow(void)
503 {
504         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505         DPSOFTRAST_State_Thread *thread;
506         int i;
507         int j;
508         DPSOFTRAST_Flush();
509         // expand texture array as needed
510         if (dpsoftrast.texture_max < 1024)
511                 dpsoftrast.texture_max = 1024;
512         else
513                 dpsoftrast.texture_max *= 2;
514         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
515         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516                 if (dpsoftrast.texbound[i])
517                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
518         for (j = 0; j < dpsoftrast.numthreads; j++)
519         {
520                 thread = &dpsoftrast.threads[j];
521                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522                         if (thread->texbound[i])
523                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
524         }
525 }
526
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
528 {
529         int w;
530         int h;
531         int d;
532         int size;
533         int s;
534         int texnum;
535         int mipmaps;
536         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538         DPSOFTRAST_Texture *texture;
539         if (width*height*depth < 1)
540         {
541                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
542                 return 0;
543         }
544         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
545         {
546                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
547                 return 0;
548         }
549         switch(texformat)
550         {
551         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
554                 break;
555         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
557                 {
558                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
559                         return 0;
560                 }
561                 if (depth != 1)
562                 {
563                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
564                         return 0;
565                 }
566                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
567                 {
568                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
569                         return 0;
570                 }
571                 break;
572         }
573         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
574         {
575                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
576                 return 0;
577         }
578         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
579         {
580                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
581                 return 0;
582         }
583         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
584         {
585                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
586                 return 0;
587         }
588         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
589         {
590                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
591                 return 0;
592         }
593         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
594         {
595                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
596                 return 0;
597         }
598         // find first empty slot in texture array
599         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600                 if (!dpsoftrast.texture[texnum].bytes)
601                         break;
602         dpsoftrast.texture_firstfree = texnum + 1;
603         if (dpsoftrast.texture_max <= texnum)
604                 DPSOFTRAST_Texture_Grow();
605         if (dpsoftrast.texture_end <= texnum)
606                 dpsoftrast.texture_end = texnum + 1;
607         texture = &dpsoftrast.texture[texnum];
608         memset(texture, 0, sizeof(*texture));
609         texture->flags = flags;
610         texture->width = width;
611         texture->height = height;
612         texture->depth = depth;
613         texture->sides = sides;
614         texture->binds = 0;
615         w = width;
616         h = height;
617         d = depth;
618         size = 0;
619         mipmaps = 0;
620         w = width;
621         h = height;
622         d = depth;
623         for (;;)
624         {
625                 s = w * h * d * sides * 4;
626                 texture->mipmap[mipmaps][0] = size;
627                 texture->mipmap[mipmaps][1] = s;
628                 texture->mipmap[mipmaps][2] = w;
629                 texture->mipmap[mipmaps][3] = h;
630                 texture->mipmap[mipmaps][4] = d;
631                 size += s;
632                 mipmaps++;
633                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
634                         break;
635                 if (w > 1) w >>= 1;
636                 if (h > 1) h >>= 1;
637                 if (d > 1) d >>= 1;
638         }
639         texture->mipmaps = mipmaps;
640         texture->size = size;
641
642         // allocate the pixels now
643         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
644
645         return texnum;
646 }
647 void DPSOFTRAST_Texture_Free(int index)
648 {
649         DPSOFTRAST_Texture *texture;
650         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
651         if (texture->binds)
652                 DPSOFTRAST_Flush();
653         if (texture->bytes)
654                 MM_FREE(texture->bytes);
655         texture->bytes = NULL;
656         memset(texture, 0, sizeof(*texture));
657         // adjust the free range and used range
658         if (dpsoftrast.texture_firstfree > index)
659                 dpsoftrast.texture_firstfree = index;
660         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
661                 dpsoftrast.texture_end--;
662 }
663 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
664 {
665         int i, x, y, z, w, layer0, layer1, row0, row1;
666         unsigned char *o, *i0, *i1, *i2, *i3;
667         DPSOFTRAST_Texture *texture;
668         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
669         if (texture->mipmaps <= 1)
670                 return;
671         for (i = 1;i < texture->mipmaps;i++)
672         {
673                 for (z = 0;z < texture->mipmap[i][4];z++)
674                 {
675                         layer0 = z*2;
676                         layer1 = z*2+1;
677                         if (layer1 >= texture->mipmap[i-1][4])
678                                 layer1 = texture->mipmap[i-1][4]-1;
679                         for (y = 0;y < texture->mipmap[i][3];y++)
680                         {
681                                 row0 = y*2;
682                                 row1 = y*2+1;
683                                 if (row1 >= texture->mipmap[i-1][3])
684                                         row1 = texture->mipmap[i-1][3]-1;
685                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
686                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
687                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
688                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
689                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
690                                 w = texture->mipmap[i][2];
691                                 if (layer1 > layer0)
692                                 {
693                                         if (texture->mipmap[i-1][2] > 1)
694                                         {
695                                                 // average 3D texture
696                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
697                                                 {
698                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
699                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
700                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
701                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
702                                                 }
703                                         }
704                                         else
705                                         {
706                                                 // average 3D mipmap with parent width == 1
707                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
708                                                 {
709                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
710                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
711                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
712                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
713                                                 }
714                                         }
715                                 }
716                                 else
717                                 {
718                                         if (texture->mipmap[i-1][2] > 1)
719                                         {
720                                                 // average 2D texture (common case)
721                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
722                                                 {
723                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
724                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
725                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
726                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
727                                                 }
728                                         }
729                                         else
730                                         {
731                                                 // 2D texture with parent width == 1
732                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
733                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
734                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
735                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
736                                         }
737                                 }
738                         }
739                 }
740         }
741 }
742 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
743 {
744         DPSOFTRAST_Texture *texture;
745         unsigned char *dst;
746         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
747         if (texture->binds)
748                 DPSOFTRAST_Flush();
749         if (pixels)
750         {
751                 dst = texture->bytes + texture->mipmap[0][1] +(-blocky * texture->mipmap[0][2] + blockx) * 4;
752                 while (blockheight > 0)
753                 {
754                         dst -= texture->mipmap[0][2] * 4;
755                         memcpy(dst, pixels, blockwidth * 4);
756                         pixels += blockwidth * 4;
757                         blockheight--;
758                 }
759         }
760         DPSOFTRAST_Texture_CalculateMipmaps(index);
761 }
762 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
763 {
764         DPSOFTRAST_Texture *texture;
765         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
766         if (texture->binds)
767                 DPSOFTRAST_Flush();
768         if (pixels)
769         {
770                 int i, stride = texture->mipmap[0][2]*4;
771                 unsigned char *dst = texture->bytes + texture->mipmap[0][1];
772                 for (i = texture->mipmap[0][3];i > 0;i--)
773                 {
774                         dst -= stride;
775                         memcpy(dst, pixels, stride);
776                         pixels += stride;
777                 }
778         }
779         DPSOFTRAST_Texture_CalculateMipmaps(index);
780 }
781 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
782 {
783         DPSOFTRAST_Texture *texture;
784         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
785         return texture->mipmap[mip][2];
786 }
787 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
788 {
789         DPSOFTRAST_Texture *texture;
790         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
791         return texture->mipmap[mip][3];
792 }
793 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
794 {
795         DPSOFTRAST_Texture *texture;
796         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
797         return texture->mipmap[mip][4];
798 }
799 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
800 {
801         DPSOFTRAST_Texture *texture;
802         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
803         if (texture->binds)
804                 DPSOFTRAST_Flush();
805         return texture->bytes + texture->mipmap[mip][0];
806 }
807 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
808 {
809         DPSOFTRAST_Texture *texture;
810         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
811         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
812         {
813                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
814                 return;
815         }
816         if (texture->binds)
817                 DPSOFTRAST_Flush();
818         texture->filter = filter;
819 }
820
// declared here because DPSOFTRAST_AllocateCommand below calls it when
// dpsoftrast.usethreads is not set
static void DPSOFTRAST_Draw_FlushThreads(void);
822
823 static void DPSOFTRAST_Draw_SyncCommands(void)
824 {
825         if(dpsoftrast.usethreads) MEMORY_BARRIER;
826         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
827 }
828
829 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
830 {
831         DPSOFTRAST_State_Thread *thread;
832         int i;
833         int freecommand = dpsoftrast.commandpool.freecommand;
834         int usedcommands = dpsoftrast.commandpool.usedcommands;
835         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
836                 return;
837         DPSOFTRAST_Draw_SyncCommands();
838         for(;;)
839         {
840                 int waitindex = -1;
841                 int commandoffset;
842                 usedcommands = 0;
843                 for (i = 0; i < dpsoftrast.numthreads; i++)
844                 {
845                         thread = &dpsoftrast.threads[i]; 
846                         commandoffset = freecommand - thread->commandoffset;
847                         if (commandoffset < 0)
848                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
849                         if (commandoffset > usedcommands)
850                         {
851                                 waitindex = i;
852                                 usedcommands = commandoffset;
853                         }
854                 }
855                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
856                         break;
857                 thread = &dpsoftrast.threads[waitindex];
858                 Thread_LockMutex(thread->drawmutex);
859                 if (thread->commandoffset != dpsoftrast.drawcommand)
860                 {
861                         thread->waiting = true;
862                         if (thread->starving) Thread_CondSignal(thread->drawcond);
863                         Thread_CondWait(thread->waitcond, thread->drawmutex);
864                         thread->waiting = false;
865                 }
866                 Thread_UnlockMutex(thread->drawmutex);
867         }
868         dpsoftrast.commandpool.usedcommands = usedcommands;
869 }
870
// rounds size up to the next multiple of COMMAND_SIZE
// (the masking only works when COMMAND_SIZE is a power of two)
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) - (((size) + (COMMAND_SIZE-1)) & (COMMAND_SIZE-1)))
// allocates a DPSOFTRAST_Command_<name> from the command pool using opcode
// DPSOFTRAST_OPCODE_<name> and the aligned size of the command struct,
// and yields it as a typed pointer
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
875
876 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
877 {
878         DPSOFTRAST_Command *command;
879         int freecommand = dpsoftrast.commandpool.freecommand;
880         int usedcommands = dpsoftrast.commandpool.usedcommands;
881         int extra = sizeof(DPSOFTRAST_Command);
882         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
883                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
884         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
885         {
886                 if (dpsoftrast.usethreads)
887                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
888                 else
889                         DPSOFTRAST_Draw_FlushThreads();
890                 freecommand = dpsoftrast.commandpool.freecommand;
891                 usedcommands = dpsoftrast.commandpool.usedcommands;
892         }
893         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
894         {
895                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
896                 command->opcode = DPSOFTRAST_OPCODE_Reset;
897                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
898                 freecommand = 0;
899         }
900         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
901         command->opcode = opcode;
902         command->commandsize = size;
903         freecommand += size;
904         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
905                 freecommand = 0;
906         dpsoftrast.commandpool.freecommand = freecommand;
907         dpsoftrast.commandpool.usedcommands = usedcommands + size;
908         return command;
909 }
910
911 static void DPSOFTRAST_UndoCommand(int size)
912 {
913         int freecommand = dpsoftrast.commandpool.freecommand;
914         int usedcommands = dpsoftrast.commandpool.usedcommands;
915         freecommand -= size;
916         if (freecommand < 0)
917                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
918         usedcommands -= size;
919         dpsoftrast.commandpool.freecommand = freecommand;
920         dpsoftrast.commandpool.usedcommands = usedcommands;
921 }
922                 
923 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
924 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
925 {
926         thread->viewport[0] = command->x;
927         thread->viewport[1] = command->y;
928         thread->viewport[2] = command->width;
929         thread->viewport[3] = command->height;
930         thread->validate |= DPSOFTRAST_VALIDATE_FB;
931 }
932 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
933 {
934         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
935         command->x = x;
936         command->y = y;
937         command->width = width;
938         command->height = height;
939
940         dpsoftrast.viewport[0] = x;
941         dpsoftrast.viewport[1] = y;
942         dpsoftrast.viewport[2] = width;
943         dpsoftrast.viewport[3] = height;
944         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
945 }
946
947 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
948 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
949 {
950         int i, x1, y1, x2, y2, w, h, x, y;
951         int miny1, maxy1, miny2, maxy2;
952         int bandy;
953         unsigned int *p;
954         unsigned int c;
955         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
956         miny1 = thread->miny1;
957         maxy1 = thread->maxy1;
958         miny2 = thread->miny2;
959         maxy2 = thread->maxy2;
960         x1 = thread->fb_scissor[0];
961         y1 = thread->fb_scissor[1];
962         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
963         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
964         if (y1 < miny1) y1 = miny1;
965         if (y2 > maxy2) y2 = maxy2;
966         w = x2 - x1;
967         h = y2 - y1;
968         if (w < 1 || h < 1)
969                 return;
970         // FIXME: honor fb_colormask?
971         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
972         for (i = 0;i < 4;i++)
973         {
974                 if (!dpsoftrast.fb_colorpixels[i])
975                         continue;
976                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
977                 for (;y < bandy;y++)
978                 {
979                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
980                         for (x = x1;x < x2;x++)
981                                 p[x] = c;
982                 }
983         }
984 }
985 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
986 {
987         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
988         command->r = r;
989         command->g = g;
990         command->b = b;
991         command->a = a;
992 }
993
994 DEFCOMMAND(3, ClearDepth, float depth;)
995 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
996 {
997         int x1, y1, x2, y2, w, h, x, y;
998         int miny1, maxy1, miny2, maxy2;
999         int bandy;
1000         unsigned int *p;
1001         unsigned int c;
1002         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
1003         miny1 = thread->miny1;
1004         maxy1 = thread->maxy1;
1005         miny2 = thread->miny2;
1006         maxy2 = thread->maxy2;
1007         x1 = thread->fb_scissor[0];
1008         y1 = thread->fb_scissor[1];
1009         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1010         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
1011         if (y1 < miny1) y1 = miny1;
1012         if (y2 > maxy2) y2 = maxy2;
1013         w = x2 - x1;
1014         h = y2 - y1;
1015         if (w < 1 || h < 1)
1016                 return;
1017         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
1018         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1019         for (;y < bandy;y++)
1020         {
1021                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1022                 for (x = x1;x < x2;x++)
1023                         p[x] = c;
1024         }
1025 }
1026 void DPSOFTRAST_ClearDepth(float d)
1027 {
1028         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1029         command->depth = d;
1030 }
1031
1032 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1033 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1034 {
1035         thread->colormask[0] = command->r != 0;
1036         thread->colormask[1] = command->g != 0;
1037         thread->colormask[2] = command->b != 0;
1038         thread->colormask[3] = command->a != 0;
1039         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1040 }
1041 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1042 {
1043         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1044         command->r = r;
1045         command->g = g;
1046         command->b = b;
1047         command->a = a;
1048 }
1049
1050 DEFCOMMAND(5, DepthTest, int enable;)
1051 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1052 {
1053         thread->depthtest = command->enable;
1054         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1055 }
1056 void DPSOFTRAST_DepthTest(int enable)
1057 {
1058         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1059         command->enable = enable;
1060 }
1061
1062 DEFCOMMAND(6, ScissorTest, int enable;)
1063 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1064 {
1065         thread->scissortest = command->enable;
1066         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1067 }
1068 void DPSOFTRAST_ScissorTest(int enable)
1069 {
1070         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1071         command->enable = enable;
1072 }
1073
1074 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1075 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1076 {
1077         thread->scissor[0] = command->x;
1078         thread->scissor[1] = command->y;
1079         thread->scissor[2] = command->width;
1080         thread->scissor[3] = command->height;
1081         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1082 }
1083 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1084 {
1085         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1086         command->x = x;
1087         command->y = y;
1088         command->width = width;
1089         command->height = height;
1090 }
1091
1092 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1093 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1094 {
1095         thread->blendfunc[0] = command->sfactor;
1096         thread->blendfunc[1] = command->dfactor;
1097         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1098 }
1099 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1100 {
1101         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1102         command->sfactor = sfactor;
1103         command->dfactor = dfactor;
1104 }
1105
1106 DEFCOMMAND(9, BlendSubtract, int enable;)
1107 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1108 {
1109         thread->blendsubtract = command->enable;
1110         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1111 }
1112 void DPSOFTRAST_BlendSubtract(int enable)
1113 {
1114         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1115         command->enable = enable;
1116 }
1117
1118 DEFCOMMAND(10, DepthMask, int enable;)
1119 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1120 {
1121         thread->depthmask = command->enable;
1122 }
1123 void DPSOFTRAST_DepthMask(int enable)
1124 {
1125         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1126         command->enable = enable;
1127 }
1128
1129 DEFCOMMAND(11, DepthFunc, int func;)
1130 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1131 {
1132         thread->depthfunc = command->func;
1133 }
1134 void DPSOFTRAST_DepthFunc(int func)
1135 {
1136         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1137         command->func = func;
1138 }
1139
1140 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1141 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1142 {
1143         thread->depthrange[0] = command->nearval;
1144         thread->depthrange[1] = command->farval;
1145 }
1146 void DPSOFTRAST_DepthRange(float nearval, float farval)
1147 {
1148         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1149         command->nearval = nearval;
1150         command->farval = farval;
1151 }
1152
1153 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1154 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1155 {
1156         thread->polygonoffset[0] = command->alongnormal;
1157         thread->polygonoffset[1] = command->intoview;
1158 }
1159 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1160 {
1161         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1162         command->alongnormal = alongnormal;
1163         command->intoview = intoview;
1164 }
1165
1166 DEFCOMMAND(14, CullFace, int mode;)
1167 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1168 {
1169         thread->cullface = command->mode;
1170 }
1171 void DPSOFTRAST_CullFace(int mode)
1172 {
1173         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1174         command->mode = mode;
1175 }
1176
1177 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1178 {
1179         dpsoftrast.color[0] = r;
1180         dpsoftrast.color[1] = g;
1181         dpsoftrast.color[2] = b;
1182         dpsoftrast.color[3] = a;
1183 }
1184
1185 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1186 {
1187         int outstride = blockwidth * 4;
1188         int instride = dpsoftrast.fb_width * 4;
1189         int bx1 = blockx;
1190         int by1 = blocky;
1191         int bx2 = blockx + blockwidth;
1192         int by2 = blocky + blockheight;
1193         int bw;
1194         int x;
1195         int y;
1196         unsigned char *inpixels;
1197         unsigned char *b;
1198         unsigned char *o;
1199         DPSOFTRAST_Flush();
1200         if (bx1 < 0) bx1 = 0;
1201         if (by1 < 0) by1 = 0;
1202         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1203         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1204         bw = bx2 - bx1;
1205         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1206         if (dpsoftrast.bigendian)
1207         {
1208                 for (y = by1;y < by2;y++)
1209                 {
1210                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1212                         for (x = bx1;x < bx2;x++)
1213                         {
1214                                 o[0] = b[3];
1215                                 o[1] = b[2];
1216                                 o[2] = b[1];
1217                                 o[3] = b[0];
1218                                 o += 4;
1219                                 b += 4;
1220                         }
1221                 }
1222         }
1223         else
1224         {
1225                 for (y = by1;y < by2;y++)
1226                 {
1227                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1228                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1229                         memcpy(o, b, bw*4);
1230                 }
1231         }
1232
1233 }
1234 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1235 {
1236         int tx1 = tx;
1237         int ty1 = ty;
1238         int tx2 = tx + width;
1239         int ty2 = ty + height;
1240         int sx1 = sx;
1241         int sy1 = sy;
1242         int sx2 = sx + width;
1243         int sy2 = sy + height;
1244         int swidth;
1245         int sheight;
1246         int twidth;
1247         int theight;
1248         int sw;
1249         int sh;
1250         int tw;
1251         int th;
1252         int y;
1253         unsigned int *spixels;
1254         unsigned int *tpixels;
1255         DPSOFTRAST_Texture *texture;
1256         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1257         if (mip < 0 || mip >= texture->mipmaps) return;
1258         DPSOFTRAST_Flush();
1259         spixels = dpsoftrast.fb_colorpixels[0];
1260         swidth = dpsoftrast.fb_width;
1261         sheight = dpsoftrast.fb_height;
1262         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1263         twidth = texture->mipmap[mip][2];
1264         theight = texture->mipmap[mip][3];
1265         if (tx1 < 0) tx1 = 0;
1266         if (ty1 < 0) ty1 = 0;
1267         if (tx2 > twidth) tx2 = twidth;
1268         if (ty2 > theight) ty2 = theight;
1269         if (sx1 < 0) sx1 = 0;
1270         if (sy1 < 0) sy1 = 0;
1271         if (sx2 > swidth) sx2 = swidth;
1272         if (sy2 > sheight) sy2 = sheight;
1273         tw = tx2 - tx1;
1274         th = ty2 - ty1;
1275         sw = sx2 - sx1;
1276         sh = sy2 - sy1;
1277         if (tw > sw) tw = sw;
1278         if (th > sh) th = sh;
1279         if (tw < 1 || th < 1)
1280                 return;
1281         sy1 = sheight - sy1 - th;
1282         ty1 = theight - ty1 - th;
1283         for (y = 0;y < th;y++)
1284                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1285         if (texture->mipmaps > 1)
1286                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1287 }
1288
1289 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1290 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1291 {
1292         if (thread->texbound[command->unitnum])
1293                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1294         thread->texbound[command->unitnum] = command->texture;
1295 }
1296 void DPSOFTRAST_SetTexture(int unitnum, int index)
1297 {
1298         DPSOFTRAST_Command_SetTexture *command;
1299         DPSOFTRAST_Texture *texture;
1300         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1301         {
1302                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1303                 return;
1304         }
1305         texture = DPSOFTRAST_Texture_GetByIndex(index);
1306         if (index && !texture)
1307         {
1308                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1309                 return;
1310         }
1311
1312         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1313         command->unitnum = unitnum;
1314         command->texture = texture;
1315
1316         dpsoftrast.texbound[unitnum] = texture;
1317         if (texture)
1318                 ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1319 }
1320
1321 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1322 {
1323         dpsoftrast.pointer_vertex3f = vertex3f;
1324         dpsoftrast.stride_vertex = stride;
1325 }
1326 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1327 {
1328         dpsoftrast.pointer_color4f = color4f;
1329         dpsoftrast.pointer_color4ub = NULL;
1330         dpsoftrast.stride_color = stride;
1331 }
1332 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1333 {
1334         dpsoftrast.pointer_color4f = NULL;
1335         dpsoftrast.pointer_color4ub = color4ub;
1336         dpsoftrast.stride_color = stride;
1337 }
1338 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1339 {
1340         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1341         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1342         dpsoftrast.stride_texcoord[unitnum] = stride;
1343 }
1344
1345 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1346 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1347 {
1348         thread->shader_mode = command->mode;
1349         thread->shader_permutation = command->permutation;
1350         thread->shader_exactspecularmath = command->exactspecularmath;
1351 }
1352 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1353 {
1354         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1355         command->mode = mode;
1356         command->permutation = permutation;
1357         command->exactspecularmath = exactspecularmath;
1358
1359         dpsoftrast.shader_mode = mode;
1360         dpsoftrast.shader_permutation = permutation;
1361         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1362 }
1363
1364 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1365 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1366 {
1367         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1368 }
1369 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1370 {
1371         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1372         command->index = index;
1373         command->val[0] = v0;
1374         command->val[1] = v1;
1375         command->val[2] = v2;
1376         command->val[3] = v3;
1377
1378         dpsoftrast.uniform4f[index*4+0] = v0;
1379         dpsoftrast.uniform4f[index*4+1] = v1;
1380         dpsoftrast.uniform4f[index*4+2] = v2;
1381         dpsoftrast.uniform4f[index*4+3] = v3;
1382 }
1383 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1384 {
1385         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1386         command->index = index;
1387         memcpy(command->val, v, sizeof(command->val));
1388
1389         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1390 }
1391
1392 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1393 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1394 {
1395         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1396 }
1397 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1398 {
1399 #ifdef SSE_POSSIBLE
1400         int i, index;
1401         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1402         {
1403                 __m128 m0, m1, m2, m3;
1404                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1405                 command->index = (DPSOFTRAST_UNIFORM)index;
1406                 if (((size_t)v)&(ALIGN_SIZE-1))
1407                 {
1408                         m0 = _mm_loadu_ps(v);
1409                         m1 = _mm_loadu_ps(v+4);
1410                         m2 = _mm_loadu_ps(v+8);
1411                         m3 = _mm_loadu_ps(v+12);
1412                 }
1413                 else
1414                 {
1415                         m0 = _mm_load_ps(v);
1416                         m1 = _mm_load_ps(v+4);
1417                         m2 = _mm_load_ps(v+8);
1418                         m3 = _mm_load_ps(v+12);
1419                 }
1420                 if (transpose)
1421                 {
1422                         __m128 t0, t1, t2, t3;
1423                         t0 = _mm_unpacklo_ps(m0, m1);
1424                         t1 = _mm_unpacklo_ps(m2, m3);
1425                         t2 = _mm_unpackhi_ps(m0, m1);
1426                         t3 = _mm_unpackhi_ps(m2, m3);
1427                         m0 = _mm_movelh_ps(t0, t1);
1428                         m1 = _mm_movehl_ps(t1, t0);
1429                         m2 = _mm_movelh_ps(t2, t3);
1430                         m3 = _mm_movehl_ps(t3, t2);                     
1431                 }
1432                 _mm_store_ps(command->val, m0);
1433                 _mm_store_ps(command->val+4, m1);
1434                 _mm_store_ps(command->val+8, m2);
1435                 _mm_store_ps(command->val+12, m3);
1436                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1437                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1438                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1439                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1440         }
1441 #endif
1442 }
1443
1444 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1445 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1446 {
1447         thread->uniform1i[command->index] = command->val;
1448 }
1449 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1450 {
1451         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1452         command->index = index;
1453         command->val = i0;
1454
1455         dpsoftrast.uniform1i[command->index] = i0;
1456 }
1457
1458 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1459 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1460 {
1461         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1462         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1463 }
1464 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1465 {
1466         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1467         command->clipplane[0] = x;
1468         command->clipplane[1] = y;
1469         command->clipplane[2] = z;
1470         command->clipplane[3] = w;
1471 }
1472
1473 #ifdef SSE_POSSIBLE
1474 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1475 {
1476         float *end = dst + size*4;
1477         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1478         {
1479                 while (dst < end)
1480                 {
1481                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1482                         dst += 4;
1483                         src += stride;
1484                 }
1485         }
1486         else
1487         {
1488                 while (dst < end)
1489                 {
1490                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1491                         dst += 4;
1492                         src += stride;
1493                 }
1494         }
1495 }
1496
1497 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1498 {
1499         float *end = dst + size*4;
1500         if (stride == sizeof(float[3]))
1501         {
1502                 float *end4 = dst + (size&~3)*4;        
1503                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1504                 {
1505                         while (dst < end4)
1506                         {
1507                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1508                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1509                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1510                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1511                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1512                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1513                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1514                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1515                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1516                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1517                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1518                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1519                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1520                                 dst += 16;
1521                                 src += 4*sizeof(float[3]);
1522                         }
1523                 }
1524                 else
1525                 {
1526                         while (dst < end4)
1527                         {
1528                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1529                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1530                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1531                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1532                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1533                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1534                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1535                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1536                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1537                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1538                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1539                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1540                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1541                                 dst += 16;
1542                                 src += 4*sizeof(float[3]);
1543                         }
1544                 }
1545         }
1546         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1547         {
1548                 while (dst < end)
1549                 {
1550                         __m128 v = _mm_loadu_ps((const float *)src);
1551                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1552                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1553                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1554                         _mm_store_ps(dst, v);
1555                         dst += 4;
1556                         src += stride;
1557                 }
1558         }
1559         else
1560         {
1561                 while (dst < end)
1562                 {
1563                         __m128 v = _mm_load_ps((const float *)src);
1564                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1565                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1566                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1567                         _mm_store_ps(dst, v);
1568                         dst += 4;
1569                         src += stride;
1570                 }
1571         }
1572 }
1573
1574 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1575 {
1576         float *end = dst + size*4;
1577         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1578         if (stride == sizeof(float[2]))
1579         {
1580                 float *end2 = dst + (size&~1)*4;
1581                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1582                 {
1583                         while (dst < end2)
1584                         {
1585                                 __m128 v = _mm_loadu_ps((const float *)src);
1586                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1587                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1588                                 dst += 8;
1589                                 src += 2*sizeof(float[2]);
1590                         }
1591                 }
1592                 else
1593                 {
1594                         while (dst < end2)
1595                         {
1596                                 __m128 v = _mm_load_ps((const float *)src);
1597                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1598                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1599                                 dst += 8;
1600                                 src += 2*sizeof(float[2]);
1601                         }
1602                 }
1603         }
1604         while (dst < end)
1605         {
1606                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1607                 dst += 4;
1608                 src += stride;
1609         }
1610 }
1611
1612 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1613 {
1614         float *end = dst + size*4;
1615         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1616         if (stride == sizeof(unsigned char[4]))
1617         {
1618                 float *end4 = dst + (size&~3)*4;
1619                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1620                 {
1621                         while (dst < end4)
1622                         {
1623                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1624                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1625                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1626                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1627                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1628                                 dst += 16;
1629                                 src += 4*sizeof(unsigned char[4]);
1630                         }
1631                 }
1632                 else
1633                 {
1634                         while (dst < end4)
1635                         {
1636                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1637                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1638                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1639                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1640                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1641                                 dst += 16;
1642                                 src += 4*sizeof(unsigned char[4]);
1643                         }
1644                 }
1645         }
1646         while (dst < end)
1647         {
1648                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1649                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1650                 dst += 4;
1651                 src += stride;
1652         }
1653 }
1654
1655 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1656 {
1657         float *end = dst + 4*size;
1658         __m128 v = _mm_loadu_ps(src);
1659         while (dst < end)
1660         {
1661                 _mm_store_ps(dst, v);
1662                 dst += 4;
1663         }
1664 }
1665 #endif
1666
// Transforms numitems float[4] vectors by a 16-float matrix.
//   out4f       - output vectors, written with _mm_store_ps (aligned store)
//   in4f        - input vectors (may be the same array as out4f)
//   numitems    - number of vectors
//   inmatrix16f - matrix; each output is
//                 in[0]*M[0..3] + in[1]*M[4..7] + in[2]*M[8..11] + in[3]*M[12..15]
// A matrix that compares equal (memcmp) to the identity turns the call into
// a plain copy, or into nothing when out4f == in4f.
// Without SSE_POSSIBLE this function does nothing.
static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
        static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
        __m128 mx, my, mz, mw;
        float *stop;
        if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
        {
                // fast case for identity matrix
                if (out4f != in4f)
                        memcpy(out4f, in4f, numitems * sizeof(float[4]));
                return;
        }
        stop = out4f + numitems*4;
        mx = _mm_loadu_ps(inmatrix16f);
        my = _mm_loadu_ps(inmatrix16f + 4);
        mz = _mm_loadu_ps(inmatrix16f + 8);
        mw = _mm_loadu_ps(inmatrix16f + 12);
        if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
        {
                for (;out4f < stop;out4f += 4, in4f += 4)
                {
                        __m128 v = _mm_loadu_ps(in4f);
                        __m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
                        __m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
                        __m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
                        __m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
                        // summed innermost-first: tx + (ty + (tz + tw))
                        _mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
                }
        }
        else
        {
                for (;out4f < stop;out4f += 4, in4f += 4)
                {
                        __m128 v = _mm_load_ps(in4f);
                        __m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx);
                        __m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my);
                        __m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz);
                        __m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
                        _mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
                }
        }
#endif
}
1714
// Disabled code (compiled out by #if 0): copies numitems float[4] records
// from in4f to out4f with a single memcpy.
1715 #if 0
1716 static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
1717 {
1718         memcpy(out4f, in4f, numitems * sizeof(float[4]));
1719 }
1720 #endif
1721
1722 #ifdef SSE_POSSIBLE
// Perspective-divides one __m128 vertex and applies the viewport transform.
// With in = (x, y, z, w) and c = viewportcenter, s = viewportscale, the steps are:
//   1. broadcast w to all lanes
//   2. rotate the vertex to (w, x, y, z) and replace lane 0 with 1.0f -> (1, x, y, z)
//   3. p = c + s * p / w, lane by lane
//   4. rotate back, so that
//      out = (c[1] + s[1]*x/w, c[2] + s[2]*y/w, c[3] + s[3]*z/w, c[0] + s[0]/w)
// The expansion is a bare { } block that declares locals named p and w:
// arguments must not mention those identifiers, and the macro is not safe as
// the unbraced body of an if that has an else.
1723 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1724 { \
1725         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1726         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1727         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1728         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1729 }
1730
// NOTE(review): the body of this macro is token-for-token identical to
// DPSOFTRAST_PROJECTVERTEX above (it projects all four lanes, not only y).
1731 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1732 { \
1733         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1734         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1735         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1736         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1737 }
1738
// Transforms one __m128 vertex by four __m128 matrix vectors:
//   out = in[0]*m0 + (in[1]*m1 + (in[2]*m2 + in[3]*m3))
// Declares a local named p inside a bare { } block; the same argument
// restrictions as for DPSOFTRAST_PROJECTVERTEX apply.
1739 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1740 { \
1741         __m128 p = (in); \
1742         out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1743                                                   _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1744                                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1745                                                                                         _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
1746 }
1747
// Computes a screen-space Y range for a bounding box and reports which of
// its corners fall behind the z + w = 0 plane after transformation.
//   starty, endy     - outputs: integer Y range derived from the projected corners
//   minposf, maxposf - box extents as float[4], read with _mm_load_ps (aligned load)
//   inmatrix16f      - 16-float matrix applied to every corner
// Returns an 8-bit mask; bit k stays set when corner k has z + w < 0 (or the
// comparison is unordered). Bits 0, 1 and 2 of the corner number select the
// max (set) or min (clear) extent for x, y and z respectively.
1748 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1749 {
1750         int clipmask = 0xFF;
1751         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
        // minproj/maxproj start inverted (2 / -2) so that the first projected corner replaces both
1752         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1753         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1754         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
        // swap lanes 0 and 1 of every matrix vector so that the transformed y
        // ends up in lane 0, where the scalar (_ss) operations below work
1755         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1756         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1757         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1758         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
        // BBFRONT(k, pos): transform corner k into bb[k], store z + w in lane 0
        // of clipdist[k]; if that is >= 0, clear bit k of clipmask and fold
        // y / w into minproj/maxproj.
        // Neither BBFRONT nor BBCLIP is #undef'd within this function.
1759         #define BBFRONT(k, pos) \
1760         { \
1761                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1762                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1763                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1764                 { \
1765                         __m128 proj; \
1766                         clipmask &= ~(1<<k); \
1767                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1768                         minproj = _mm_min_ss(minproj, proj); \
1769                         maxproj = _mm_max_ss(maxproj, proj); \
1770                 } \
1771         }
        // the eight corners; lane 3 is taken from minpos for corners 0-3 and from maxpos for corners 4-7
1772         BBFRONT(0, minpos); 
1773         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1774         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1775         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1776         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1777         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1778         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1779         BBFRONT(7, maxpos);
        // BBCLIP(k): for a corner that is still flagged in clipmask, look at
        // its three edge neighbours (k^1, k^2, k^4). For each neighbour that
        // is not flagged, interpolate along the edge to the point where
        // z + w reaches 0 and fold that point's y / w into minproj/maxproj.
1780         #define BBCLIP(k) \
1781         { \
1782                 if (clipmask&(1<<k)) \
1783                 { \
1784                         if (!(clipmask&(1<<(k^1)))) \
1785                         { \
1786                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1787                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1788                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1789                                 minproj = _mm_min_ss(minproj, proj); \
1790                                 maxproj = _mm_max_ss(maxproj, proj); \
1791                         } \
1792                         if (!(clipmask&(1<<(k^2)))) \
1793                         { \
1794                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1795                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1796                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1797                                 minproj = _mm_min_ss(minproj, proj); \
1798                                 maxproj = _mm_max_ss(maxproj, proj); \
1799                         } \
1800                         if (!(clipmask&(1<<(k^4)))) \
1801                         { \
1802                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1803                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1804                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1805                                 minproj = _mm_min_ss(minproj, proj); \
1806                                 maxproj = _mm_max_ss(maxproj, proj); \
1807                         } \
1808                 } \
1809         }
1810         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
        // move element 2 of the viewport vectors (the one DPSOFTRAST_PROJECTVERTEX
        // applies to y) into lane 0 for the scalar math below
1811         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1812         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
        // clamp the projected range to [-2, 2] before scaling
1813         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1814         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1815         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1816         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
        // NOTE(review): starty comes from the max projection and endy from the min,
        // which presumably means the y viewport scale is negative - confirm.
1817         *starty = _mm_cvttss_si32(maxproj);
1818         *endy = _mm_cvttss_si32(minproj)+1;
1819         return clipmask;
1820 }
1821         
1822 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1823 {
1824         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1825         float *end = out4f + numitems*4;
1826         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1827         __m128 minpos, maxpos;
1828         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1829         {
1830                 minpos = maxpos = _mm_loadu_ps(in4f);
1831                 while (out4f < end)
1832                 {
1833                         __m128 v = _mm_loadu_ps(in4f);
1834                         minpos = _mm_min_ps(minpos, v);
1835                         maxpos = _mm_max_ps(maxpos, v);
1836                         _mm_store_ps(out4f, v);
1837                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1838                         _mm_store_ps(screen4f, v);
1839                         in4f += 4;
1840                         out4f += 4;
1841                         screen4f += 4;
1842                 }
1843         }
1844         else
1845         {
1846                 minpos = maxpos = _mm_load_ps(in4f);
1847                 while (out4f < end)
1848                 {
1849                         __m128 v = _mm_load_ps(in4f);
1850                         minpos = _mm_min_ps(minpos, v);
1851                         maxpos = _mm_max_ps(maxpos, v);
1852                         _mm_store_ps(out4f, v);
1853                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1854                         _mm_store_ps(screen4f, v);
1855                         in4f += 4;
1856                         out4f += 4;
1857                         screen4f += 4;
1858                 }
1859         }
1860         if (starty && endy) 
1861         {
1862                 ALIGN(float minposf[4]);
1863                 ALIGN(float maxposf[4]);
1864                 _mm_store_ps(minposf, minpos);
1865                 _mm_store_ps(maxposf, maxpos);
1866                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1867         }
1868         return 0;
1869 }
1870
1871 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1872 {
1873         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1874         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1875         float *end;
1876         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1877                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1878         end = out4f + numitems*4;
1879         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1880         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1881         m0 = _mm_loadu_ps(inmatrix16f);
1882         m1 = _mm_loadu_ps(inmatrix16f + 4);
1883         m2 = _mm_loadu_ps(inmatrix16f + 8);
1884         m3 = _mm_loadu_ps(inmatrix16f + 12);
1885         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1886         {
1887                 minpos = maxpos = _mm_loadu_ps(in4f);
1888                 while (out4f < end)
1889                 {
1890                         __m128 v = _mm_loadu_ps(in4f);
1891                         minpos = _mm_min_ps(minpos, v);
1892                         maxpos = _mm_max_ps(maxpos, v);
1893                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1894                         _mm_store_ps(out4f, v);
1895                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1896                         _mm_store_ps(screen4f, v);
1897                         in4f += 4;
1898                         out4f += 4;
1899                         screen4f += 4;
1900                 }
1901         }
1902         else
1903         {
1904                 minpos = maxpos = _mm_load_ps(in4f);
1905                 while (out4f < end)
1906                 {
1907                         __m128 v = _mm_load_ps(in4f);
1908                         minpos = _mm_min_ps(minpos, v);
1909                         maxpos = _mm_max_ps(maxpos, v);
1910                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1911                         _mm_store_ps(out4f, v);
1912                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1913                         _mm_store_ps(screen4f, v);
1914                         in4f += 4;
1915                         out4f += 4;
1916                         screen4f += 4;
1917                 }
1918         }
1919         if (starty && endy) 
1920         {
1921                 ALIGN(float minposf[4]);
1922                 ALIGN(float maxposf[4]);
1923                 _mm_store_ps(minposf, minpos);
1924                 _mm_store_ps(maxposf, maxpos);
1925                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1926         }
1927         return 0;
1928 }
1929 #endif
1930
// Loads one vertex attribute array into dpsoftrast.post_array4f[outarray] as
// float[4] records, covering dpsoftrast.numvertices vertices starting at
// dpsoftrast.firstvertex.
//   outarray - index of the destination post_array4f slot
//   inarray  - DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_COLOR, or a
//              texcoord array (anything else is treated as
//              DPSOFTRAST_ARRAY_TEXCOORD0 + unit)
// Colors come from pointer_color4f if set, else from pointer_color4ub, else
// the constant dpsoftrast.color is replicated. A texcoord unit without a
// pointer, or with a component count other than 2, 3 or 4, leaves the
// destination untouched.
// Returns the destination array, or NULL when built without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
        float *outf = dpsoftrast.post_array4f[outarray];
        const unsigned char *inb;
        int firstvertex = dpsoftrast.firstvertex;
        int numvertices = dpsoftrast.numvertices;
        int stride;
        if (inarray == DPSOFTRAST_ARRAY_POSITION)
        {
                stride = dpsoftrast.stride_vertex;
                inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
                DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
        }
        else if (inarray == DPSOFTRAST_ARRAY_COLOR)
        {
                stride = dpsoftrast.stride_color;
                if (dpsoftrast.pointer_color4f)
                {
                        inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
                        DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
                }
                else if (dpsoftrast.pointer_color4ub)
                {
                        inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
                        DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
                }
                else
                        DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
        }
        else
        {
                int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
                stride = dpsoftrast.stride_texcoord[unit];
                if (dpsoftrast.pointer_texcoordf[unit])
                {
                        int components = dpsoftrast.components_texcoord[unit];
                        inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
                        if (components == 2)
                                DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
                        else if (components == 3)
                                DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
                        else if (components == 4)
                                DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
                }
        }
        return outf;
#else
        return NULL;
#endif
}
1989
1990 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1991 {
1992         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1993         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1994         return data;
1995 }
1996
// Disabled code (compiled out by #if 0): loads inarray into post_array4f[outarray]
// (or reuses the slot's contents when inarray is negative), projects it in place
// with DPSOFTRAST_Vertex_Project and stores the resulting clip mask and Y range
// in dpsoftrast.drawclipped / drawstarty / drawendy.
1997 #if 0
1998 static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
1999 {
2000 #ifdef SSE_POSSIBLE
2001         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2002         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
2003         return data;
2004 #else
2005         return NULL;
2006 #endif
2007 }
2008 #endif
2009
// Transforms an attribute array in place by inmatrix16f and projects it into
// dpsoftrast.screencoord4f.
//   outarray    - post_array4f slot holding (or receiving) the data
//   inarray     - attribute to load first via DPSOFTRAST_Array_Load; a
//                 negative value means the slot's current contents are used
//   inmatrix16f - matrix handed to DPSOFTRAST_Vertex_TransformProject
// The clip mask goes to dpsoftrast.drawclipped and the Y range to
// dpsoftrast.drawstarty / drawendy.
// Returns the transformed array, or NULL when built without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
        float *data;
        if (inarray >= 0)
                data = DPSOFTRAST_Array_Load(outarray, inarray);
        else
                data = dpsoftrast.post_array4f[outarray];
        dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
        return data;
#else
        return NULL;
#endif
}
2020
2021 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2022 {
2023         int x;
2024         int startx = span->startx;
2025         int endx = span->endx;
2026         float wslope = triangle->w[0];
2027         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2028         float endz = 1.0f / (w + wslope * startx);
2029         if (triangle->w[0] == 0)
2030         {
2031                 // LordHavoc: fast flat polygons (HUD/menu)
2032                 for (x = startx;x < endx;x++)
2033                         zf[x] = endz;
2034                 return;
2035         }
2036         for (x = startx;x < endx;)
2037         {
2038                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2039                 float z = endz, dz;
2040                 if (nextsub >= endx) nextsub = endsub = endx-1;
2041                 endz = 1.0f / (w + wslope * nextsub);
2042                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2043                 for (; x <= endsub; x++, z += dz)
2044                         zf[x] = z;
2045         }
2046 }
2047
2048 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2049 {
2050 #ifdef SSE_POSSIBLE
2051         int x;
2052         int startx = span->startx;
2053         int endx = span->endx;
2054         int maskx;
2055         int subx;
2056         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2057         unsigned char * RESTRICT pixelmask = span->pixelmask;
2058         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2059         if (!pixeli)
2060                 return;
2061         pixeli += span->y * dpsoftrast.fb_width + span->x;
2062         // handle alphatest now (this affects depth writes too)
2063         if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2064                 for (x = startx;x < endx;x++)
2065                         if (in4ub[x*4+3] < 128)
2066                                 pixelmask[x] = false;
2067         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2068         // helps sprites, text and hud artwork
2069         switch(thread->fb_blendmode)
2070         {
2071         case DPSOFTRAST_BLENDMODE_ALPHA:
2072         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2073         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2074                 maskx = startx;
2075                 for (x = startx;x < endx;x++)
2076                 {
2077                         if (in4ub[x*4+3] >= 1)
2078                         {
2079                                 startx = x;
2080                                 for (;;)
2081                                 {
2082                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2083                                         maskx = x;
2084                                         if (x >= endx) break;
2085                                         ++x;
2086                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2087                                         if (x >= endx) break;
2088                                 }
2089                                 break;
2090                         }
2091                 }
2092                 endx = maskx;
2093                 break;
2094         case DPSOFTRAST_BLENDMODE_OPAQUE:
2095         case DPSOFTRAST_BLENDMODE_ADD:
2096         case DPSOFTRAST_BLENDMODE_INVMOD:
2097         case DPSOFTRAST_BLENDMODE_MUL:
2098         case DPSOFTRAST_BLENDMODE_MUL2:
2099         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2100         case DPSOFTRAST_BLENDMODE_INVADD:
2101                 break;
2102         }
2103         // put some special values at the end of the mask to ensure the loops end
2104         pixelmask[endx] = 1;
2105         pixelmask[endx+1] = 0;
2106         // LordHavoc: use a double loop to identify subspans, this helps the
2107         // optimized copy/blend loops to perform at their best, most triangles
2108         // have only one run of pixels, and do the search using wide reads...
2109         x = startx;
2110         while (x < endx)
2111         {
2112                 // if this pixel is masked off, it's probably not alone...
2113                 if (!pixelmask[x])
2114                 {
2115                         x++;
2116 #if 1
2117                         if (x + 8 < endx)
2118                         {
2119                                 // the 4-item search must be aligned or else it stalls badly
2120                                 if ((x & 3) && !pixelmask[x]) 
2121                                 {
2122                                         if(pixelmask[x]) goto endmasked;
2123                                         x++;
2124                                         if (x & 3)
2125                                         {
2126                                                 if(pixelmask[x]) goto endmasked;
2127                                                 x++;
2128                                                 if (x & 3)
2129                                                 {
2130                                                         if(pixelmask[x]) goto endmasked;
2131                                                         x++;
2132                                                 }
2133                                         }
2134                                 }
2135                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2136                                         x += 4;
2137                         }
2138 #endif
2139                         for (;!pixelmask[x];x++)
2140                                 ;
2141                         // rather than continue the loop, just check the end variable
2142                         if (x >= endx)
2143                                 break;
2144                 }
2145         endmasked:
2146                 // find length of subspan
2147                 subx = x + 1;
2148 #if 1
2149                 if (subx + 8 < endx)
2150                 {
2151                         if (subx & 3)
2152                         {
2153                                 if(!pixelmask[subx]) goto endunmasked;
2154                                 subx++;
2155                                 if (subx & 3)
2156                                 {
2157                                         if(!pixelmask[subx]) goto endunmasked;
2158                                         subx++;
2159                                         if (subx & 3)
2160                                         {
2161                                                 if(!pixelmask[subx]) goto endunmasked;
2162                                                 subx++;
2163                                         }
2164                                 }
2165                         }
2166                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2167                                 subx += 4;
2168                 }
2169 #endif
2170                 for (;pixelmask[subx];subx++)
2171                         ;
2172                 // the checks can overshoot, so make sure to clip it...
2173                 if (subx > endx)
2174                         subx = endx;
2175         endunmasked:
2176                 // now that we know the subspan length...  process!
2177                 switch(thread->fb_blendmode)
2178                 {
2179                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2180 #if 0
2181                         if (subx - x >= 16)
2182                         {
2183                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2184                                 x = subx;
2185                         }
2186                         else
2187 #elif 1
2188                         while (x + 16 <= subx)
2189                         {
2190                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2191                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2192                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2193                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2194                                 x += 16;
2195                         }
2196 #endif
2197                         {
2198                                 while (x + 4 <= subx)
2199                                 {
2200                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2201                                         x += 4;
2202                                 }
2203                                 if (x + 2 <= subx)
2204                                 {
2205                                         pixeli[x] = ini[x];
2206                                         pixeli[x+1] = ini[x+1];
2207                                         x += 2;
2208                                 }
2209                                 if (x < subx)
2210                                 {
2211                                         pixeli[x] = ini[x];
2212                                         x++;
2213                                 }
2214                         }
2215                         break;
2216                 case DPSOFTRAST_BLENDMODE_ALPHA:
2217                 #define FINISHBLEND(blend2, blend1) \
2218                         for (;x + 1 < subx;x += 2) \
2219                         { \
2220                                 __m128i src, dst; \
2221                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2222                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2223                                 blend2; \
2224                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2225                         } \
2226                         if (x < subx) \
2227                         { \
2228                                 __m128i src, dst; \
2229                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2230                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2231                                 blend1; \
2232                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2233                                 x++; \
2234                         }
2235                         FINISHBLEND({
2236                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2237                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2238                         }, {
2239                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2240                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2241                         });
2242                         break;
2243                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2244                         FINISHBLEND({
2245                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2246                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2247                         }, {
2248                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2249                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2250                         });
2251                         break;
2252                 case DPSOFTRAST_BLENDMODE_ADD:
2253                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2254                         break;
2255                 case DPSOFTRAST_BLENDMODE_INVMOD:
2256                         FINISHBLEND({
2257                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2258                         }, {
2259                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2260                         });
2261                         break;
2262                 case DPSOFTRAST_BLENDMODE_MUL:
2263                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2264                         break;
2265                 case DPSOFTRAST_BLENDMODE_MUL2:
2266                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2267                         break;
2268                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2269                         FINISHBLEND({
2270                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2271                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2272                         }, {
2273                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2274                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2275                         });
2276                         break;
2277                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2278                         FINISHBLEND({
2279                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2280                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2281                         }, {
2282                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2283                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2284                         });
2285                         break;
2286                 case DPSOFTRAST_BLENDMODE_INVADD:
2287                         FINISHBLEND({
2288                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2289                         }, {
2290                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2291                         });
2292                         break;
2293                 }
2294         }
2295 #endif
2296 }
2297
2298 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2299         // warning: this is SLOW, only use if the optimized per-span functions won't do
2300 {
2301         const unsigned char * RESTRICT pixelbase;
2302         const unsigned char * RESTRICT pixel[4];
2303         int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2304         int wrapmask[2] = { width-1, height-1 };
2305         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*width;
2306         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2307         {
2308                 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2309                 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2310                 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2311                 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2312                 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2313                 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2314                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2315                 {
2316                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2317                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2318                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2319                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2320                 }
2321                 else
2322                 {
2323                         tci[0] &= wrapmask[0];
2324                         tci[1] &= wrapmask[1];
2325                         tci1[0] &= wrapmask[0];
2326                         tci1[1] &= wrapmask[1];
2327                 }
2328                 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2329                 pixel[1] = pixelbase + 4 * (tci[0] - tci[1]*width);
2330                 pixel[2] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2331                 pixel[3] = pixelbase + 4 * (tci[0] - tci1[1]*width);
2332                 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2333                 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2334                 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2335                 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2336         }
2337         else
2338         {
2339                 int tci[2] = { x * width, y * height };
2340                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2341                 {
2342                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2343                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2344                 }
2345                 else
2346                 {
2347                         tci[0] &= wrapmask[0];
2348                         tci[1] &= wrapmask[1];
2349                 }
2350                 pixel[0] = pixelbase + 4 * (tci[0] - tci[1]*width);
2351                 c[0] = pixel[0][0];
2352                 c[1] = pixel[0][1];
2353                 c[2] = pixel[0][2];
2354                 c[3] = pixel[0][3];
2355         }
2356 }
2357
#if 0
// (compiled out) float-output counterpart of the BGRA8 span texture fetch:
// fills out4f[startx..endx-1] with one 4-float color per pixel, sampled from
// the texture bound to texunitindex at the coordinates interpolated from
// arrayindex.  Channels 0 and 2 of the stored texel are swapped on output.
// Coordinates are evaluated exactly at subspan boundaries and stepped in
// 20.12 fixed point in between.
static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
{
	int px;
	int ch;
	int axis;
	int first = span->startx;
	int limit = span->endx;
	int flags;
	float texel[4];
	float data[4];
	float slope[4];
	float cur[2], next[2];
	float scale[2];
	unsigned int lo[2];
	unsigned int hi[2];
	unsigned int texmin[2];
	unsigned int texmax[2];
	int wrap[2];
	int rowstride;
	int bilinear;
	int level;
	const unsigned char * RESTRICT pixelbase;
	const unsigned char * RESTRICT tap[4];
	DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
	// if no texture is bound, just fill it with white
	if (!texture)
	{
		for (px = first;px < limit;px++)
			for (ch = 0;ch < 4;ch++)
				out4f[px*4+ch] = 1.0f;
		return;
	}
	level = triangle->mip[texunitindex];
	pixelbase = (unsigned char *)texture->bytes + texture->mipmap[level][0] + texture->mipmap[level][1] - 4*texture->mipmap[level][2];
	// if this mipmap of the texture is 1 pixel, just fill it with that color
	if (texture->mipmap[level][1] == 4)
	{
		texel[0] = texture->bytes[2] * (1.0f/255.0f);
		texel[1] = texture->bytes[1] * (1.0f/255.0f);
		texel[2] = texture->bytes[0] * (1.0f/255.0f);
		texel[3] = texture->bytes[3] * (1.0f/255.0f);
		for (px = first;px < limit;px++)
			for (ch = 0;ch < 4;ch++)
				out4f[px*4+ch] = texel[ch];
		return;
	}
	bilinear = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
	DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
	flags = texture->flags;
	scale[0] = texture->mipmap[level][2];
	scale[1] = texture->mipmap[level][3];
	// rows are addressed downwards from pixelbase
	rowstride = -texture->mipmap[level][2];
	texmin[0] = 0;
	texmin[1] = 0;
	texmax[0] = texture->mipmap[level][2]-1;
	texmax[1] = texture->mipmap[level][3]-1;
	wrap[0] = texture->mipmap[level][2]-1;
	wrap[1] = texture->mipmap[level][3]-1;
	next[0] = (data[0] + slope[0]*first) * zf[first] * scale[0];
	next[1] = (data[1] + slope[1]*first) * zf[first] * scale[1];
	if (bilinear)
	{
		// sample positions are shifted by half a texel when filtering
		next[0] -= 0.5f;
		next[1] -= 0.5f;
	}
	px = first;
	while (px < limit)
	{
		unsigned int fixed[2];
		unsigned int step[2];
		float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
		// anchor is where the next exact coordinate is evaluated,
		// last is the final pixel written by this subspan
		int anchor = px + DPSOFTRAST_DRAW_MAXSUBSPAN;
		int last = anchor - 1;
		if (anchor >= limit)
		{
			anchor = last = limit-1;
			if (px < anchor)
				subscale = 4096.0f / (anchor - px);
		}
		cur[0] = next[0];
		cur[1] = next[1];
		next[0] = (data[0] + slope[0]*anchor) * zf[anchor] * scale[0];
		next[1] = (data[1] + slope[1]*anchor) * zf[anchor] * scale[1];
		if (bilinear)
		{
			next[0] -= 0.5f;
			next[1] -= 0.5f;
		}
		step[0] = (next[0] - cur[0]) * subscale;
		step[1] = (next[1] - cur[1]) * subscale;
		fixed[0] = cur[0] * (1<<12);
		fixed[1] = cur[1] * (1<<12);
		for (; px <= last; px++, fixed[0] += step[0], fixed[1] += step[1])
		{
			// integer texel and its +1 neighbor (the neighbor is only
			// read when filtering)
			lo[0] = fixed[0]>>12;
			lo[1] = fixed[1]>>12;
			hi[0] = lo[0] + 1;
			hi[1] = lo[1] + 1;
			if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
			{
				for (axis = 0;axis < 2;axis++)
				{
					if (lo[axis] < texmin[axis])
						lo[axis] = texmin[axis];
					else if (lo[axis] > texmax[axis])
						lo[axis] = texmax[axis];
					if (hi[axis] < texmin[axis])
						hi[axis] = texmin[axis];
					else if (hi[axis] > texmax[axis])
						hi[axis] = texmax[axis];
				}
			}
			else
			{
				lo[0] &= wrap[0];
				lo[1] &= wrap[1];
				hi[0] &= wrap[0];
				hi[1] &= wrap[1];
			}
			tap[0] = pixelbase + 4 * (lo[1]*rowstride+lo[0]);
			if (!bilinear)
			{
				// nearest: single texel scaled to 0..1
				out4f[px*4+0] = tap[0][2] * (1.0f / 255.0f);
				out4f[px*4+1] = tap[0][1] * (1.0f / 255.0f);
				out4f[px*4+2] = tap[0][0] * (1.0f / 255.0f);
				out4f[px*4+3] = tap[0][3] * (1.0f / 255.0f);
			}
			else
			{
				// bilinear: weight the four neighboring texels
				unsigned int frac[2] = { fixed[0]&0xFFF, fixed[1]&0xFFF };
				unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
				unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
				tap[1] = pixelbase + 4 * (lo[1]*rowstride+hi[0]);
				tap[2] = pixelbase + 4 * (hi[1]*rowstride+lo[0]);
				tap[3] = pixelbase + 4 * (hi[1]*rowstride+hi[0]);
				out4f[px*4+0] = (tap[0][2]*lerp[0]+tap[1][2]*lerp[1]+tap[2][2]*lerp[2]+tap[3][2]*lerp[3]) * (1.0f / 0xFF000000);
				out4f[px*4+1] = (tap[0][1]*lerp[0]+tap[1][1]*lerp[1]+tap[2][1]*lerp[2]+tap[3][1]*lerp[3]) * (1.0f / 0xFF000000);
				out4f[px*4+2] = (tap[0][0]*lerp[0]+tap[1][0]*lerp[1]+tap[2][0]*lerp[2]+tap[3][0]*lerp[3]) * (1.0f / 0xFF000000);
				out4f[px*4+3] = (tap[0][3]*lerp[0]+tap[1][3]*lerp[1]+tap[2][3]*lerp[2]+tap[3][3]*lerp[3]) * (1.0f / 0xFF000000);
			}
		}
	}
}
#endif
2556
2557 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2558 {
2559 #ifdef SSE_POSSIBLE
2560         int x;
2561         int startx = span->startx;
2562         int endx = span->endx;
2563         int flags;
2564         __m128 data, slope, tcscale;
2565         __m128i tcsize, tcmask, tcoffset, tcmax;
2566         __m128 tc, endtc;
2567         __m128i subtc, substep, endsubtc;
2568         int filter;
2569         int mip;
2570         int affine; // LordHavoc: optimized affine texturing case
2571         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2572         const unsigned char * RESTRICT pixelbase;
2573         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2574         // if no texture is bound, just fill it with white
2575         if (!texture)
2576         {
2577                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2578                 return;
2579         }
2580         mip = triangle->mip[texunitindex];
2581         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0] + texture->mipmap[mip][1] - 4*texture->mipmap[mip][2];
2582         // if this mipmap of the texture is 1 pixel, just fill it with that color
2583         if (texture->mipmap[mip][1] == 4)
2584         {
2585                 unsigned int k = *((const unsigned int *)pixelbase);
2586                 for (x = startx;x < endx;x++)
2587                         outi[x] = k;
2588                 return;
2589         }
2590         affine = zf[startx] == zf[endx-1];
2591         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2592         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2593         flags = texture->flags;
2594         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2595         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2596         tcscale = _mm_cvtepi32_ps(tcsize);
2597         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2598         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2599         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2600         if (filter)
2601                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2602         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2603         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_sub_epi32(_mm_setzero_si128(), _mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0))), 18), _mm_set1_epi32(4));
2604         tcmax = _mm_packs_epi32(tcmask, tcmask);
2605         for (x = startx;x < endx;)
2606         {
2607                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2608                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2609                 if (nextsub >= endx || affine)
2610                 {
2611                         nextsub = endsub = endx-1;
2612                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2613                 }       
2614                 tc = endtc;
2615                 subtc = endsubtc;
2616                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2617                 if (filter)
2618                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2619                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2620                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2621                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2622                 substep = _mm_slli_epi32(substep, 1);
2623                 if (filter)
2624                 {
2625                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2626                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2627                         {
2628                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2629                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2630                                 {
2631                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2632                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2633                                         tci = _mm_madd_epi16(tci, tcoffset);
2634                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2635                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2636                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2637                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2638                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2639                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2640                                         fracm = _mm_srli_epi16(subtc, 1);
2641                                         pix1 = _mm_add_epi16(pix1,
2642                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2643                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2644                                         pix3 = _mm_add_epi16(pix3,
2645                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2646                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2647                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2648                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2649                                         pix2 = _mm_add_epi16(pix2,
2650                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2651                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2652                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2653                                 }
2654                                 if (x <= endsub)
2655                                 {
2656                                         const unsigned char * RESTRICT ptr1;
2657                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2658                                         tci = _mm_madd_epi16(tci, tcoffset);
2659                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2660                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2661                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2662                                         fracm = _mm_srli_epi16(subtc, 1);
2663                                         pix1 = _mm_add_epi16(pix1,
2664                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2665                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2666                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2667                                         pix1 = _mm_add_epi16(pix1,
2668                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2669                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2670                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2671                                         x++;
2672                                 }
2673                         }
2674                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2675                         {
2676                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2677                                 {
2678                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2679                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2680                                         tci = _mm_madd_epi16(tci, tcoffset);
2681                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2682                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2683                                                                                         _mm_setzero_si128());
2684                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2685                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2686                                                                                         _mm_setzero_si128());
2687                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2688                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2689                                         tci = _mm_madd_epi16(tci, tcoffset);
2690                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2691                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2692                                                                                         _mm_setzero_si128());
2693                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2694                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2695                                                                                         _mm_setzero_si128());
2696                                         fracm = _mm_srli_epi16(subtc, 1);
2697                                         pix1 = _mm_add_epi16(pix1,
2698                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2699                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2700                                         pix3 = _mm_add_epi16(pix3,
2701                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2702                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2703                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2704                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2705                                         pix2 = _mm_add_epi16(pix2,
2706                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2707                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2708                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2709                                 }
2710                                 if (x <= endsub)
2711                                 {
2712                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2713                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2714                                         tci = _mm_madd_epi16(tci, tcoffset);
2715                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2716                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2717                                                                                         _mm_setzero_si128());
2718                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2719                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2720                                                                                         _mm_setzero_si128());
2721                                         fracm = _mm_srli_epi16(subtc, 1);
2722                                         pix1 = _mm_add_epi16(pix1,
2723                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2724                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2725                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2726                                         pix1 = _mm_add_epi16(pix1,
2727                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2728                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2729                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2730                                         x++;
2731                                 }
2732                         }
2733                         else
2734                         {
2735                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2736                                 {
2737                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2738                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2739                                         tci = _mm_madd_epi16(tci, tcoffset);
2740                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2741                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2742                                                                                         _mm_setzero_si128());
2743                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2744                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2745                                                                                         _mm_setzero_si128());
2746                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2747                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2748                                         tci = _mm_madd_epi16(tci, tcoffset);
2749                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2750                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2751                                                                                         _mm_setzero_si128());
2752                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2753                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2754                                                                                         _mm_setzero_si128());
2755                                         fracm = _mm_srli_epi16(subtc, 1);
2756                                         pix1 = _mm_add_epi16(pix1,
2757                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2758                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2759                                         pix3 = _mm_add_epi16(pix3,
2760                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2761                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2762                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2763                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2764                                         pix2 = _mm_add_epi16(pix2,
2765                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2766                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2767                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2768                                 }
2769                                 if (x <= endsub)
2770                                 {
2771                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2772                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2773                                         tci = _mm_madd_epi16(tci, tcoffset);
2774                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2775                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2776                                                                                         _mm_setzero_si128());
2777                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2778                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2779                                                                                         _mm_setzero_si128());
2780                                         fracm = _mm_srli_epi16(subtc, 1);
2781                                         pix1 = _mm_add_epi16(pix1,
2782                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2783                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2784                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2785                                         pix1 = _mm_add_epi16(pix1,
2786                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2787                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2788                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2789                                         x++;
2790                                 }
2791                         }
2792                 }
2793                 else
2794                 {
2795                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2796                         {
2797                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2798                                 {
2799                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2800                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2801                                         tci = _mm_madd_epi16(tci, tcoffset);
2802                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2803                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2804                                 }
2805                                 if (x <= endsub)
2806                                 {
2807                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2808                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2809                                         tci = _mm_madd_epi16(tci, tcoffset);
2810                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2811                                         x++;
2812                                 }
2813                         }
2814                         else
2815                         {
2816                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2817                                 {
2818                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2819                                         tci = _mm_and_si128(tci, tcmax); 
2820                                         tci = _mm_madd_epi16(tci, tcoffset);
2821                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2822                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2823                                 }
2824                                 if (x <= endsub)
2825                                 {
2826                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2827                                         tci = _mm_and_si128(tci, tcmax); 
2828                                         tci = _mm_madd_epi16(tci, tcoffset);
2829                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2830                                         x++;
2831                                 }
2832                         }
2833                 }
2834         }
2835 #endif
2836 }
2837
2838 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2839 {
2840         // TODO: IMPLEMENT
2841         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2842 }
2843
// Shadowmap lookup stub: the vector argument is ignored and the result is
// always 1.0f.
static float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2849
#if 0
// (disabled) Multiplies each float RGBA pixel of in4f by the interpolated
// attribute arrayindex, scaled by zf[x], and stores the product in out4f for
// x in [span->startx, span->endx).
static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
{
	int x, i;
	const int startx = span->startx;
	const int endx = span->endx;
	float data[4];
	float slope[4];
	DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
	for (x = startx;x < endx;x++)
	{
		const float z = zf[x];
		const float *src = in4f + x*4;
		float *dst = out4f + x*4;
		for (i = 0;i < 4;i++)
		{
			// attribute value at this pixel
			const float c = (data[i] + slope[i]*x) * z;
			dst[i] = src[i] * c;
		}
	}
}
#endif
2875
#if 0
// (disabled) Writes the interpolated attribute arrayindex, scaled by zf[x],
// as a float RGBA pixel into out4f for x in [span->startx, span->endx).
static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
{
	int x, i;
	const int startx = span->startx;
	const int endx = span->endx;
	float data[4];
	float slope[4];
	DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
	for (x = startx;x < endx;x++)
	{
		const float z = zf[x];
		float *dst = out4f + x*4;
		for (i = 0;i < 4;i++)
		{
			const float c = (data[i] + slope[i]*x) * z;
			dst[i] = c;
		}
	}
}
#endif
2901
#if 0
// (disabled) out = ina + max(inb - subcolor, 0), per component, for every
// float RGBA pixel x in [span->startx, span->endx).
static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
{
	int x, i;
	const int startx = span->startx;
	const int endx = span->endx;
	float c[4], localcolor[4];
	// take a private copy of the color before any output is written
	for (i = 0;i < 4;i++)
		localcolor[i] = subcolor[i];
	for (x = startx;x < endx;x++)
	{
		// all four clamped differences are formed before the pixel is stored
		for (i = 0;i < 4;i++)
		{
			c[i] = inb4f[x*4+i] - localcolor[i];
			if (c[i] < 0.0f)
				c[i] = 0.0f;
		}
		for (i = 0;i < 4;i++)
			out4f[x*4+i] = ina4f[x*4+i] + c[i];
	}
}
#endif
2924
#if 0
// (disabled) out = ina * inb, per component, for every float RGBA pixel x in
// [span->startx, span->endx).
static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
{
	// walk the span as a flat run of floats, four per pixel
	int i;
	const int first = span->startx * 4;
	const int last = span->endx * 4;
	for (i = first;i < last;i++)
		out4f[i] = ina4f[i] * inb4f[i];
}
#endif
2938
#if 0
// (disabled) out = ina + inb, per component, for every float RGBA pixel x in
// [span->startx, span->endx).
static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
{
	// walk the span as a flat run of floats, four per pixel
	int i;
	const int first = span->startx * 4;
	const int last = span->endx * 4;
	for (i = first;i < last;i++)
		out4f[i] = ina4f[i] + inb4f[i];
}
#endif
2952
#if 0
// (disabled) Blends inb over ina using inb's fourth component as the weight:
// out = ina * (1 - inb[3]) + inb * inb[3], for every float RGBA pixel x in
// [span->startx, span->endx).
static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
{
	int x, i;
	const int startx = span->startx;
	const int endx = span->endx;
	float a, b;
	for (x = startx;x < endx;x++)
	{
		// both weights are read before any component of this pixel is written
		a = 1.0f - inb4f[x*4+3];
		b = inb4f[x*4+3];
		for (i = 0;i < 4;i++)
			out4f[x*4+i] = ina4f[x*4+i] * a + inb4f[x*4+i] * b;
	}
}
#endif
2969
#if 0
// (disabled) Blends a constant color over in4f using the color's fourth
// component as the weight: out = in * (1 - color[3]) + color * color[3], for
// every float RGBA pixel x in [span->startx, span->endx).
static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
{
	int x, i;
	const int startx = span->startx;
	const int endx = span->endx;
	float localcolor[4], ilerp, lerp;
	// take a private copy of the color before any output is written
	for (i = 0;i < 4;i++)
		localcolor[i] = color[i];
	ilerp = 1.0f - localcolor[3];
	lerp = localcolor[3];
	for (x = startx;x < endx;x++)
	{
		for (i = 0;i < 4;i++)
			out4f[x*4+i] = in4f[x*4+i] * ilerp + localcolor[i] * lerp;
	}
}
#endif
2990
2991
2992
2993 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2994 {
2995 #ifdef SSE_POSSIBLE
2996         int x;
2997         int startx = span->startx;
2998         int endx = span->endx;
2999         __m128 data, slope;
3000         __m128 mod, endmod;
3001         __m128i submod, substep, endsubmod;
3002         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3003         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3004         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3005         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3006         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3007         for (x = startx; x < endx;)
3008         {
3009                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3010                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3011                 if (nextsub >= endx)
3012                 {
3013                         nextsub = endsub = endx-1;
3014                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3015                 }
3016                 mod = endmod;
3017                 submod = endsubmod;
3018                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3019                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3020                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3021                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3022                 substep = _mm_packs_epi32(substep, substep);
3023                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3024                 {
3025                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3026                         pix = _mm_mulhi_epu16(pix, submod);
3027                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3028                 }
3029                 if (x <= endsub)
3030                 {
3031                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3032                         pix = _mm_mulhi_epu16(pix, submod);
3033                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3034                         x++;
3035                 }
3036         }
3037 #endif
3038 }
3039
3040 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3041 {
3042 #ifdef SSE_POSSIBLE
3043         int x;
3044         int startx = span->startx;
3045         int endx = span->endx;
3046         __m128 data, slope;
3047         __m128 mod, endmod;
3048         __m128i submod, substep, endsubmod;
3049         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3050         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3051         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3052         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3053         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3054         for (x = startx; x < endx;)
3055         {
3056                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3057                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3058                 if (nextsub >= endx)
3059                 {
3060                         nextsub = endsub = endx-1;
3061                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3062                 }
3063                 mod = endmod;
3064                 submod = endsubmod;
3065                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3066                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3067                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3068                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3069                 substep = _mm_packs_epi32(substep, substep);
3070                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3071                 {
3072                         __m128i pix = _mm_srai_epi16(submod, 4);
3073                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3074                 }
3075                 if (x <= endsub)
3076                 {
3077                         __m128i pix = _mm_srai_epi16(submod, 4);
3078                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3079                         x++;
3080                 }
3081         }
3082 #endif
3083 }
3084
3085 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3086 {
3087 #ifdef SSE_POSSIBLE
3088         int x, startx = span->startx, endx = span->endx;
3089         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3090         localcolor = _mm_packs_epi32(localcolor, localcolor);
3091         for (x = startx;x+2 <= endx;x+=2)
3092         {
3093                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3094                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3095                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3096                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3097         }
3098         if (x < endx)
3099         {
3100                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3101                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3102                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3103                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3104         }
3105 #endif
3106 }
3107
3108 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3109 {
3110 #ifdef SSE_POSSIBLE
3111         int x, startx = span->startx, endx = span->endx;
3112         for (x = startx;x+2 <= endx;x+=2)
3113         {
3114                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3115                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3116                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3117                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3118         }
3119         if (x < endx)
3120         {
3121                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3122                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3123                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3124                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3125         }
3126 #endif
3127 }
3128
3129 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3130 {
3131 #ifdef SSE_POSSIBLE
3132         int x, startx = span->startx, endx = span->endx;
3133         for (x = startx;x+2 <= endx;x+=2)
3134         {
3135                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3136                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3137                 pix1 = _mm_add_epi16(pix1, pix2);
3138                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3139         }
3140         if (x < endx)
3141         {
3142                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3143                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3144                 pix1 = _mm_add_epi16(pix1, pix2);
3145                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3146         }
3147 #endif
3148 }
3149
3150 #if 0
3151 static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3152 {
3153 #ifdef SSE_POSSIBLE
3154         int x, startx = span->startx, endx = span->endx;
3155         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3156         tint = _mm_packs_epi32(tint, tint);
3157         for (x = startx;x+2 <= endx;x+=2)
3158         {
3159                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3160                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3161                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3162                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3163         }
3164         if (x < endx)
3165         {
3166                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3167                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3168                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3169                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3170         }
3171 #endif
3172 }
3173 #endif
3174
3175 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3176 {
3177 #ifdef SSE_POSSIBLE
3178         int x, startx = span->startx, endx = span->endx;
3179         for (x = startx;x+2 <= endx;x+=2)
3180         {
3181                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3182                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3183                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3184                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3185                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3186         }
3187         if (x < endx)
3188         {
3189                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3190                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3191                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3192                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3193                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3194         }
3195 #endif
3196 }
3197
3198 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3199 {
3200 #ifdef SSE_POSSIBLE
3201         int x, startx = span->startx, endx = span->endx;
3202         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3203         localcolor = _mm_packs_epi32(localcolor, localcolor);
3204         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3205         for (x = startx;x+2 <= endx;x+=2)
3206         {
3207                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3208                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3209                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3210         }
3211         if (x < endx)
3212         {
3213                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3214                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3215                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3216         }
3217 #endif
3218 }
3219
3220
3221
3222 static void DPSOFTRAST_VertexShader_Generic(void)
3223 {
3224         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3225         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3226         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3227         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3228                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3229 }
3230
3231 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3232 {
3233         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3234         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3235         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3236         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3237         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3238         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3239         {
3240                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3241                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3242                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3243                 {
3244                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3245                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3246                         {
3247                                 // multiply
3248                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3249                         }
3250                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3251                         {
3252                                 // add
3253                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3254                         }
3255                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3256                         {
3257                                 // alphablend
3258                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3259                         }
3260                 }
3261         }
3262         else
3263                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3264         if(thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
3265         {
3266                 int x;
3267                 for (x = span->startx;x < span->endx;x++)
3268                         buffer_FragColorbgra8[x*4+3] = buffer_FragColorbgra8[x*4+3] * thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3269         }
3270         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3271 }
3272
3273
3274
3275 static void DPSOFTRAST_VertexShader_PostProcess(void)
3276 {
3277         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3278         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3279         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3280 }
3281
3282 static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3283 {
3284         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3285         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3286         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3287         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3288         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3289         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3290         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3291         {
3292                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3293                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3294         }
3295         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3296         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3297         {
3298                 // TODO: implement saturation
3299         }
3300         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3301         {
3302                 // TODO: implement gammaramps
3303         }
3304         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3305 }
3306
3307
3308
3309 static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3310 {
3311         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3312 }
3313
3314 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3315 {
3316         // this is never called (because colormask is off when this shader is used)
3317         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3318         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3319         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3320         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3321         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3322 }
3323
3324
3325
3326 static void DPSOFTRAST_VertexShader_FlatColor(void)
3327 {
3328         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3329         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3330 }
3331
3332 static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3333 {
3334 #ifdef SSE_POSSIBLE
3335         unsigned char * RESTRICT pixelmask = span->pixelmask;
3336         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3337         int x, startx = span->startx, endx = span->endx;
3338         __m128i Color_Ambientm;
3339         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3340         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3341         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3342         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3343         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3344         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3345                 pixel = buffer_FragColorbgra8;
3346         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3347         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3348         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3349         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3350         for (x = startx;x < endx;x++)
3351         {
3352                 __m128i color, pix;
3353                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3354                 {
3355                         __m128i pix2;
3356                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3357                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3358                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3359                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3360                         x += 3;
3361                         continue;
3362                 }
3363                 if (!pixelmask[x])
3364                         continue;
3365                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3366                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3367                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3368         }
3369         if (pixel == buffer_FragColorbgra8)
3370                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3371 #endif
3372 }
3373
3374
3375
3376 static void DPSOFTRAST_VertexShader_VertexColor(void)
3377 {
3378         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3379         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3380         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3381 }
3382
3383 static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3384 {
3385 #ifdef SSE_POSSIBLE
3386         unsigned char * RESTRICT pixelmask = span->pixelmask;
3387         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3388         int x, startx = span->startx, endx = span->endx;
3389         __m128i Color_Ambientm, Color_Diffusem;
3390         __m128 data, slope;
3391         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3392         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3393         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3394         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3395         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3396         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3397         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3398                 pixel = buffer_FragColorbgra8;
3399         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3400         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3401         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3402         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3403         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3404         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3405         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3406         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3407         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3408         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3409         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3410         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3411         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3412         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3413         {
3414                 __m128i color, mod, pix;
3415                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3416                 {
3417                         __m128i pix2, mod2;
3418                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3419                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3420                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3421                         data = _mm_add_ps(data, slope);
3422                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3423                         data = _mm_add_ps(data, slope);
3424                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3425                         data = _mm_add_ps(data, slope);
3426                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3427                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3428                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3429                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3430                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3431                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3432                         x += 3;
3433                         continue;
3434                 }
3435                 if (!pixelmask[x])
3436                         continue;
3437                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3438                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3439                 mod = _mm_packs_epi32(mod, mod);
3440                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3441                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3442         }
3443         if (pixel == buffer_FragColorbgra8)
3444                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3445 #endif
3446 }
3447
3448
3449
3450 static void DPSOFTRAST_VertexShader_Lightmap(void)
3451 {
3452         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3453         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3454         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3455 }
3456
3457 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3458 {
3459 #ifdef SSE_POSSIBLE
3460         unsigned char * RESTRICT pixelmask = span->pixelmask;
3461         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3462         int x, startx = span->startx, endx = span->endx;
3463         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3464         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3465         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3466         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3467         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3468         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3469         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3470         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3471         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3472         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3473                 pixel = buffer_FragColorbgra8;
3474         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3475         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3476         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3477         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3478         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3479         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3480         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3481         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3482         {
3483                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3484                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3485                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3486                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3487                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3488                 for (x = startx;x < endx;x++)
3489                 {
3490                         __m128i color, lightmap, glow, pix;
3491                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3492                         {
3493                                 __m128i pix2;
3494                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3495                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3496                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3497                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3498                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3499                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3500                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3501                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3502                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3503                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3504                                 x += 3;
3505                                 continue;
3506                         }
3507                         if (!pixelmask[x])
3508                                 continue;
3509                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3510                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3511                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3512                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3513                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3514                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3515                 }
3516         }
3517         else
3518         {
3519                 for (x = startx;x < endx;x++)
3520                 {
3521                         __m128i color, lightmap, pix;
3522                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3523                         {
3524                                 __m128i pix2;
3525                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3526                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3527                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3528                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3529                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3530                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3531                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3532                                 x += 3;
3533                                 continue;
3534                         }
3535                         if (!pixelmask[x]) 
3536                                 continue;
3537                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3538                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3539                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3540                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3541                 }
3542         }
3543         if (pixel == buffer_FragColorbgra8)
3544                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3545 #endif
3546 }
3547
3548
3549 void DPSOFTRAST_VertexShader_LightDirection(void);
3550 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3551
// FakeLight vertex stage: has no work of its own and simply runs the
// LightDirection vertex shader.
static void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3556
3557 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3558 {
3559         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3560 }
3561
3562
3563
3564 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3565 {
3566         DPSOFTRAST_VertexShader_LightDirection();
3567         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3568 }
3569
3570 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3571 {
3572         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3573 }
3574
3575
3576
3577 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3578 {
3579         DPSOFTRAST_VertexShader_LightDirection();
3580         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3581 }
3582
3583 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3584 {
3585         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3586 }
3587
3588
3589
3590 void DPSOFTRAST_VertexShader_LightDirection(void)
3591 {
3592         int i;
3593         int numvertices = dpsoftrast.numvertices;
3594         float LightDir[4];
3595         float LightVector[4];
3596         float EyePosition[4];
3597         float EyeVectorModelSpace[4];
3598         float EyeVector[4];
3599         float position[4];
3600         float svector[4];
3601         float tvector[4];
3602         float normal[4];
3603         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3604         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3605         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3606         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3607         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3608         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3609         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3610         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3611         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3612         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3613         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3614         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3615         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3616         for (i = 0;i < numvertices;i++)
3617         {
3618                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3619                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3620                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3621                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3622                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3623                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3624                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3625                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3626                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3627                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3628                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3629                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3630                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3631                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3632                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3633                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3634                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3635                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3636                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3637                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3638                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3639                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3640                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3641                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3642                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3643                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3644                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3645                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3646                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3647         }
3648         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3649 }
3650
3651 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3652 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3653 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3654 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3655 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3656 #define DPSOFTRAST_Vector3Normalize(v)\
3657 do\
3658 {\
3659         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3660         if (len)\
3661         {\
3662                 len = 1.0f / len;\
3663                 v[0] *= len;\
3664                 v[1] *= len;\
3665                 v[2] *= len;\
3666         }\
3667 }\
3668 while(0)
3669
3670 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3671 {
3672         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3673         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3674         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3675         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3676         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3677         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3678         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3679         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3680         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3681         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3682         int x, startx = span->startx, endx = span->endx;
3683         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3684         float LightVectordata[4];
3685         float LightVectorslope[4];
3686         float EyeVectordata[4];
3687         float EyeVectorslope[4];
3688         float VectorSdata[4];
3689         float VectorSslope[4];
3690         float VectorTdata[4];
3691         float VectorTslope[4];
3692         float VectorRdata[4];
3693         float VectorRslope[4];
3694         float z;
3695         float diffusetex[4];
3696         float glosstex[4];
3697         float surfacenormal[4];
3698         float lightnormal[4];
3699         float lightnormal_modelspace[4];
3700         float eyenormal[4];
3701         float specularnormal[4];
3702         float diffuse;
3703         float specular;
3704         float SpecularPower;
3705         int d[4];
3706         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3707         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3708         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3709         Color_Glow[3] = 0.0f;
3710         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3711         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3712         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3713         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3714         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3715         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3716         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3717         Color_Pants[3] = 0.0f;
3718         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3719         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3720         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3721         Color_Shirt[3] = 0.0f;
3722         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3723         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3724         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3725         {
3726                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3727                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3728         }
3729         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3730         {
3731                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3732         }
3733         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3734         {
3735                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3736                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3737                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3738                 Color_Diffuse[3] = 0.0f;
3739                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3740                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3741                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3742                 LightColor[3] = 0.0f;
3743                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3744                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3745                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3746                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3747                 Color_Specular[3] = 0.0f;
3748                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3749                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3750                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3751
3752                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3753                 {
3754                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3755                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3756                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3757                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3758                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3759                 }
3760                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3761                 {
3762                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3763                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3764                 }
3765                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3766                 {
3767                         // nothing of this needed
3768                 }
3769                 else
3770                 {
3771                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3772                 }
3773
3774                 for (x = startx;x < endx;x++)
3775                 {
3776                         z = buffer_z[x];
3777                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3778                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3779                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3780                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3781                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3782                         {
3783                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3784                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3785                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3786                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3787                         }
3788                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3789                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3790                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3791                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3792                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3793                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3794                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3795                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3796
3797                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3798                         {
3799                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3800                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3801                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3802                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3803
3804                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3805                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3806                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3807                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3808
3809                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3810                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3811                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3812                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3813
3814                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3815                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3816                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3817                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3818
3819                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3820                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3821
3822                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3823                                 {
3824                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3825                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3826                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3827                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3828                                 }
3829                         }
3830                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3831                         {
3832                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3833                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3834                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3835                                 {
3836                                         float f = 1.0f / 256.0f;
3837                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3838                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3839                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3840                                 }
3841                         }
3842                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3843                         {
3844                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3845                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3846                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3847                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3848
3849                                 LightColor[0] = 1.0;
3850                                 LightColor[1] = 1.0;
3851                                 LightColor[2] = 1.0;
3852                         }
3853                         else
3854                         {
3855                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3856                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3857                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3858                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3859                         }
3860
3861                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3862
3863                         if(thread->shader_exactspecularmath)
3864                         {
3865                                 // reflect lightnormal at surfacenormal, take the negative of that
3866                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3867                                 float f;
3868                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3869                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3870                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3871                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3872
3873                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3874                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3875                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3876                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3877                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3878
3879                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3880                         }
3881                         else
3882                         {
3883                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3884                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3885                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3886                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3887
3888                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3889                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3890                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3891                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3892
3893                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3894                         }
3895                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
3896
3897                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3898                         {
3899                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3900                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3901                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3902                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3903                         }
3904                         else
3905                         {
3906                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3907                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3908                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3909                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3910                         }
3911
3912                         buffer_FragColorbgra8[x*4+0] = d[0];
3913                         buffer_FragColorbgra8[x*4+1] = d[1];
3914                         buffer_FragColorbgra8[x*4+2] = d[2];
3915                         buffer_FragColorbgra8[x*4+3] = d[3];
3916                 }
3917         }
3918         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3919         {
3920                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3921                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3922                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3923                 Color_Diffuse[3] = 0.0f;
3924                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3925                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3926                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3927                 LightColor[3] = 0.0f;
3928                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3929
3930                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3931                 {
3932                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3933                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3934                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3935                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3936                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3937                 }
3938                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3939                 {
3940                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3941                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3942                 }
3943                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3944                 {
3945                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3946                 }
3947                 else
3948                 {
3949                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3950                 }
3951
3952                 for (x = startx;x < endx;x++)
3953                 {
3954                         z = buffer_z[x];
3955                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3956                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3957                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3958                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3959                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3960                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3961                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3962                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3963
3964                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3965                         {
3966                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3967                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3968                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3969                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3970
3971                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3972                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3973                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3974                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3975
3976                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3977                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3978                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3979                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3980
3981                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3982                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3983                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3984                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3985
3986                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3987                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3988
3989                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3990                                 {
3991                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3992                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3993                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3994                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3995                                 }
3996                         }
3997                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3998                         {
3999                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4000                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4001                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4002                                 {
4003                                         float f = 1.0f / 256.0f;
4004                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4005                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4006                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4007                                 }
4008                         }
4009                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4010                         {
4011                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4012                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4013                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4014                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4015
4016                                 LightColor[0] = 1.0;
4017                                 LightColor[1] = 1.0;
4018                                 LightColor[2] = 1.0;
4019                         }
4020                         else
4021                         {
4022                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4023                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4024                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4025                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4026                         }
4027
4028                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4029                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4030                         {
4031                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4032                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4033                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4034                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4035                         }
4036                         else
4037                         {
4038                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4039                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4040                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4041                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4042                         }
4043                         buffer_FragColorbgra8[x*4+0] = d[0];
4044                         buffer_FragColorbgra8[x*4+1] = d[1];
4045                         buffer_FragColorbgra8[x*4+2] = d[2];
4046                         buffer_FragColorbgra8[x*4+3] = d[3];
4047                 }
4048         }
4049         else
4050         {
4051                 for (x = startx;x < endx;x++)
4052                 {
4053                         z = buffer_z[x];
4054                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4055                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4056                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4057                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4058
4059                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4060                         {
4061                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4062                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4063                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4064                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4065                         }
4066                         else
4067                         {
4068                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4069                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4070                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4071                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4072                         }
4073                         buffer_FragColorbgra8[x*4+0] = d[0];
4074                         buffer_FragColorbgra8[x*4+1] = d[1];
4075                         buffer_FragColorbgra8[x*4+2] = d[2];
4076                         buffer_FragColorbgra8[x*4+3] = d[3];
4077                 }
4078         }
4079         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4080 }
4081
4082
4083
4084 static void DPSOFTRAST_VertexShader_LightSource(void)
4085 {
4086         int i;
4087         int numvertices = dpsoftrast.numvertices;
4088         float LightPosition[4];
4089         float LightVector[4];
4090         float LightVectorModelSpace[4];
4091         float EyePosition[4];
4092         float EyeVectorModelSpace[4];
4093         float EyeVector[4];
4094         float position[4];
4095         float svector[4];
4096         float tvector[4];
4097         float normal[4];
4098         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4099         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4100         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4101         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4102         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4103         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4104         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4105         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4106         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4107         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4108         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4109         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4110         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4111         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4112         for (i = 0;i < numvertices;i++)
4113         {
4114                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4115                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4116                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4117                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4118                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4119                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4120                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4121                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4122                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4123                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4124                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4125                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4126                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4127                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4128                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4129                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4130                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4131                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4132                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4133                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4134                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4135                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4136                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4137                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4138                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4139                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4140                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4141                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4142                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4143                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4144                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4145                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4146         }
4147         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4148         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4149 }
4150
4151 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4152 {
4153 #ifdef SSE_POSSIBLE
4154         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4155         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4156         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4157         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4158         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4159         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4160         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4161         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4162         int x, startx = span->startx, endx = span->endx;
4163         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4164         float CubeVectordata[4];
4165         float CubeVectorslope[4];
4166         float LightVectordata[4];
4167         float LightVectorslope[4];
4168         float EyeVectordata[4];
4169         float EyeVectorslope[4];
4170         float z;
4171         float diffusetex[4];
4172         float glosstex[4];
4173         float surfacenormal[4];
4174         float lightnormal[4];
4175         float eyenormal[4];
4176         float specularnormal[4];
4177         float diffuse;
4178         float specular;
4179         float SpecularPower;
4180         float CubeVector[4];
4181         float attenuation;
4182         int d[4];
4183         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4184         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4185         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4186         Color_Glow[3] = 0.0f;
4187         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4188         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4189         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4190         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4191         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4192         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4193         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4194         Color_Diffuse[3] = 0.0f;
4195         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4196         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4197         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4198         Color_Specular[3] = 0.0f;
4199         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4200         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4201         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4202         Color_Pants[3] = 0.0f;
4203         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4204         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4205         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4206         Color_Shirt[3] = 0.0f;
4207         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4208         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4209         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4210         LightColor[3] = 0.0f;
4211         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4212         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4213         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4214         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4215         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4216         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4217         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4218         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4219         {
4220                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4221                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4222         }
4223         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4224                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4225         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4226         {
4227                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4228                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4229                 for (x = startx;x < endx;x++)
4230                 {
4231                         z = buffer_z[x];
4232                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4233                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4234                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4235                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4236                         if (attenuation < 0.01f)
4237                                 continue;
4238                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4239                         {
4240                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4241                                 if (attenuation < 0.01f)
4242                                         continue;
4243                         }
4244
4245                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4246                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4247                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4248                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4249                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4250                         {
4251                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4252                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4253                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4254                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4255                         }
4256                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4257                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4258                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4259                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4260                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4261                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4262                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4263                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4264
4265                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4266                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4267                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4268                         DPSOFTRAST_Vector3Normalize(lightnormal);
4269
4270                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4271
4272                         if(thread->shader_exactspecularmath)
4273                         {
4274                                 // reflect lightnormal at surfacenormal, take the negative of that
4275                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4276                                 float f;
4277                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4278                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4279                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4280                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4281
4282                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4283                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4284                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4285                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4286                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4287
4288                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4289                         }
4290                         else
4291                         {
4292                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4293                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4294                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4295                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4296
4297                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4298                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4299                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4300                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4301
4302                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4303                         }
4304                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
4305
4306                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4307                         {
4308                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4309                                 attenuation *= (1.0f / 255.0f);
4310                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4311                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4312                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4313                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4314                         }
4315                         else
4316                         {
4317                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4318                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4319                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4320                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4321                         }
4322                         buffer_FragColorbgra8[x*4+0] = d[0];
4323                         buffer_FragColorbgra8[x*4+1] = d[1];
4324                         buffer_FragColorbgra8[x*4+2] = d[2];
4325                         buffer_FragColorbgra8[x*4+3] = d[3];
4326                 }
4327         }
4328         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4329         {
4330                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4331                 for (x = startx;x < endx;x++)
4332                 {
4333                         z = buffer_z[x];
4334                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4335                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4336                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4337                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4338                         if (attenuation < 0.01f)
4339                                 continue;
4340                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4341                         {
4342                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4343                                 if (attenuation < 0.01f)
4344                                         continue;
4345                         }
4346
4347                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4348                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4349                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4350                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4351                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4352                         {
4353                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4354                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4355                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4356                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4357                         }
4358                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4359                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4360                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4361                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4362
4363                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4364                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4365                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4366                         DPSOFTRAST_Vector3Normalize(lightnormal);
4367
4368                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4369                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4370                         {
4371                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4372                                 attenuation *= (1.0f / 255.0f);
4373                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4374                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4375                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4376                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4377                         }
4378                         else
4379                         {
4380                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4381                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4382                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4383                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4384                         }
4385                         buffer_FragColorbgra8[x*4+0] = d[0];
4386                         buffer_FragColorbgra8[x*4+1] = d[1];
4387                         buffer_FragColorbgra8[x*4+2] = d[2];
4388                         buffer_FragColorbgra8[x*4+3] = d[3];
4389                 }
4390         }
4391         else
4392         {
4393                 for (x = startx;x < endx;x++)
4394                 {
4395                         z = buffer_z[x];
4396                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4397                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4398                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4399                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4400                         if (attenuation < 0.01f)
4401                                 continue;
4402                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4403                         {
4404                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4405                                 if (attenuation < 0.01f)
4406                                         continue;
4407                         }
4408
4409                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4410                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4411                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4412                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4413                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4414                         {
4415                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4416                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4417                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4418                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4419                         }
4420                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4421                         {
4422                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4423                                 attenuation *= (1.0f / 255.0f);
4424                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4425                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4426                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4427                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4428                         }
4429                         else
4430                         {
4431                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4432                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4433                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4434                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4435                         }
4436                         buffer_FragColorbgra8[x*4+0] = d[0];
4437                         buffer_FragColorbgra8[x*4+1] = d[1];
4438                         buffer_FragColorbgra8[x*4+2] = d[2];
4439                         buffer_FragColorbgra8[x*4+3] = d[3];
4440                 }
4441         }
4442         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4443 #endif
4444 }
4445
4446
4447
4448 static void DPSOFTRAST_VertexShader_Refraction(void)
4449 {
4450         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4451         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4452         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4453 }
4454
4455 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4456 {
4457         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4458         float z;
4459         int x, startx = span->startx, endx = span->endx;
4460
4461         // texture reads
4462         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4463         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4464
4465         // varyings
4466         float ModelViewProjectionPositiondata[4];
4467         float ModelViewProjectionPositionslope[4];
4468
4469         // uniforms
4470         float ScreenScaleRefractReflect[2];
4471         float ScreenCenterRefractReflect[2];
4472         float DistortScaleRefractReflect[2];
4473         float RefractColor[4];
4474
4475         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4476         if(!texture) return;
4477
4478         // read textures
4479         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4480         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4481
4482         // read varyings
4483         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4484
4485         // read uniforms
4486         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4487         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4488         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4489         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4490         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4491         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4492         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4493         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4494         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4495         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4496
4497         // do stuff
4498         for (x = startx;x < endx;x++)
4499         {
4500                 float SafeScreenTexCoord[2];
4501                 float ScreenTexCoord[2];
4502                 float v[3];
4503                 float iw;
4504                 unsigned char c[4];
4505
4506                 z = buffer_z[x];
4507
4508                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4509                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4510
4511                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4512                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4513                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4514
4515                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4516                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4517                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4518                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4519                 DPSOFTRAST_Vector3Normalize(v);
4520                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4521                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4522
4523                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4524                 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4525
4526                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4527                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4528                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4529                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4530         }
4531
4532         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4533 }
4534
4535
4536
4537 static void DPSOFTRAST_VertexShader_Water(void)
4538 {
4539         int i;
4540         int numvertices = dpsoftrast.numvertices;
4541         float EyePosition[4];
4542         float EyeVectorModelSpace[4];
4543         float EyeVector[4];
4544         float position[4];
4545         float svector[4];
4546         float tvector[4];
4547         float normal[4];
4548         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4549         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4550         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4551         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4552         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4553         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4554         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4555         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4556         for (i = 0;i < numvertices;i++)
4557         {
4558                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4559                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4560                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4561                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4562                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4563                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4564                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4565                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4566                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4567                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4568                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4569                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4570                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4571                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4572                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4573                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4574                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4575                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4576                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4577                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4578                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4579                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4580         }
4581         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4582         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4583         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4584 }
4585
4586
4587 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4588 {
4589         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4590         float z;
4591         int x, startx = span->startx, endx = span->endx;
4592
4593         // texture reads
4594         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4595         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4596
4597         // varyings
4598         float ModelViewProjectionPositiondata[4];
4599         float ModelViewProjectionPositionslope[4];
4600         float EyeVectordata[4];
4601         float EyeVectorslope[4];
4602
4603         // uniforms
4604         float ScreenScaleRefractReflect[4];
4605         float ScreenCenterRefractReflect[4];
4606         float DistortScaleRefractReflect[4];
4607         float RefractColor[4];
4608         float ReflectColor[4];
4609         float ReflectFactor;
4610         float ReflectOffset;
4611
4612         DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4613         DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4614         if(!texture_refraction || !texture_reflection) return;
4615
4616         // read textures
4617         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4618         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4619
4620         // read varyings
4621         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4622         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4623
4624         // read uniforms
4625         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4626         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4627         ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4628         ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4629         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4630         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4631         ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4632         ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4633         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4634         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4635         DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4636         DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4637         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4638         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4639         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4640         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4641         ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4642         ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4643         ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4644         ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4645         ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4646         ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4647
4648         // do stuff
4649         for (x = startx;x < endx;x++)
4650         {
4651                 float SafeScreenTexCoord[4];
4652                 float ScreenTexCoord[4];
4653                 float v[3];
4654                 float iw;
4655                 unsigned char c1[4];
4656                 unsigned char c2[4];
4657                 float Fresnel;
4658
4659                 z = buffer_z[x];
4660
4661                 // "    vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4662                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4663
4664                 // "    vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4665                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4666                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4667                 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4668                 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4669
4670                 // "    vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4671                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4672                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4673                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4674                 DPSOFTRAST_Vector3Normalize(v);
4675                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4676                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4677                 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4678                 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4679
4680                 // "    float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4681                 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4682                 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4683                 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4684                 DPSOFTRAST_Vector3Normalize(v);
4685                 Fresnel = 1.0f - v[2];
4686                 Fresnel = min(1.0f, Fresnel);
4687                 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4688
4689                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4690                 // "    dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4691                 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4692                 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4693
4694                 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4695                 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4696                 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4697                 buffer_FragColorbgra8[x*4+3] = min((    RefractColor[3] *  (1.0f - Fresnel) +          ReflectColor[3]  * Fresnel) * 256, 255);
4698         }
4699
4700         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4701 }
4702
4703
4704
4705 static void DPSOFTRAST_VertexShader_ShowDepth(void)
4706 {
4707         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4708 }
4709
4710 static void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4711 {
4712         // TODO: IMPLEMENT
4713         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4714         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4715         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4716         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4717         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4718 }
4719
4720
4721
4722 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4723 {
4724         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4725 }
4726
4727 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4728 {
4729         // TODO: IMPLEMENT
4730         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4731         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4732         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4733         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4734         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4735 }
4736
4737
4738
4739 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4740 {
4741         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4742 }
4743
4744 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4745 {
4746         // TODO: IMPLEMENT
4747         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4748         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4749         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4750         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4751         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4752 }
4753
4754
4755
4756 typedef struct DPSOFTRAST_ShaderModeInfo_s
4757 {
4758         int lodarrayindex;
4759         void (*Vertex)(void);
4760         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4761         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4762         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4763 }
4764 DPSOFTRAST_ShaderModeInfo;
4765
4766 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4767 {
4768         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4769         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4770         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4771         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4772         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4773         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4774         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4775         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4776         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4777         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4778         {2, DPSOFTRAST_VertexShader_VertexColor,                        DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4779         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4780         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4781         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4782         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4783         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4784         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4785         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4786 };
4787
4788 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4789 {
4790         int x;
4791         int startx;
4792         int endx;
4793         unsigned int *depthpixel;
4794         int depth;
4795         int depthslope;
4796         unsigned int d;
4797         unsigned char *pixelmask;
4798         DPSOFTRAST_State_Triangle *triangle;
4799         triangle = &thread->triangles[span->triangle];
4800         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4801         startx = span->startx;
4802         endx = span->endx;
4803         depth = span->depthbase;
4804         depthslope = span->depthslope;
4805         pixelmask = thread->pixelmaskarray;
4806         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4807         {
4808                 switch(thread->fb_depthfunc)
4809                 {
4810                 default:
4811                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4812                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4813                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4814                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4815                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4816                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4817                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4818                 }
4819                 while (startx < endx && !pixelmask[startx])
4820                         startx++;
4821                 while (endx > startx && !pixelmask[endx-1])
4822                         endx--;
4823         }
4824         else
4825         {
4826                 // no depth testing means we're just dealing with color...
4827                 memset(pixelmask + startx, 1, endx - startx);
4828         }
4829         span->pixelmask = pixelmask;
4830         span->startx = startx;
4831         span->endx = endx;
4832 }
4833
4834 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4835 {
4836         int x, d, depth, depthslope, startx, endx;
4837         const unsigned char *pixelmask;
4838         unsigned int *depthpixel;
4839         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4840         {
4841                 depth = span->depthbase;
4842                 depthslope = span->depthslope;
4843                 pixelmask = span->pixelmask;
4844                 startx = span->startx;
4845                 endx = span->endx;
4846                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4847                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4848                         if (pixelmask[x])
4849                                 depthpixel[x] = d;
4850         }
4851 }
4852
4853 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4854 {
4855         int i;
4856         DPSOFTRAST_State_Triangle *triangle;
4857         DPSOFTRAST_State_Span *span;
4858         for (i = 0; i < thread->numspans; i++)
4859         {
4860                 span = &thread->spans[i];
4861                 triangle = &thread->triangles[span->triangle];
4862                 DPSOFTRAST_Draw_DepthTest(thread, span);
4863                 if (span->startx >= span->endx)
4864                         continue;
4865                 // run pixel shader if appropriate
4866                 // do this before running depthmask code, to allow the pixelshader
4867                 // to clear pixelmask values for alpha testing
4868                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4869                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4870                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4871         }
4872         thread->numspans = 0;
4873 }
4874
4875 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;)
4876
4877 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4878 {
4879 #ifdef SSE_POSSIBLE
4880         int cullface = thread->cullface;
4881         int minx, maxx, miny, maxy;
4882         int miny1, maxy1, miny2, maxy2;
4883         __m128i fbmin, fbmax;
4884         __m128 viewportcenter, viewportscale;
4885         int firstvertex = command->firstvertex;
4886         int numvertices = command->numvertices;
4887         int numtriangles = command->numtriangles;
4888         const int *element3i = command->element3i;
4889         const unsigned short *element3s = command->element3s;
4890         int clipped = command->clipped;
4891         int i;
4892         int j;
4893         int k;
4894         int y;
4895         int e[3];
4896         __m128i screeny;
4897         int starty, endy, bandy;
4898         int numpoints;
4899         int clipcase;
4900         float clipdist[4];
4901         float clip0origin, clip0slope;
4902         int clip0dir;
4903         __m128 triangleedge1, triangleedge2, trianglenormal;
4904         __m128 clipfrac[3];
4905         __m128 screen[4];
4906         DPSOFTRAST_State_Triangle *triangle;
4907         DPSOFTRAST_Texture *texture;
4908         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4909         miny = thread->fb_scissor[1];
4910         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4911         miny1 = bound(miny, thread->miny1, maxy);
4912         maxy1 = bound(miny, thread->maxy1, maxy);
4913         miny2 = bound(miny, thread->miny2, maxy);
4914         maxy2 = bound(miny, thread->maxy2, maxy);
4915         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4916         {
4917                 if (!ATOMIC_DECREMENT(command->refcount))
4918                 {
4919                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4920                                 MM_FREE(command->arrays);
4921                 }
4922                 return;
4923         }
4924         minx = thread->fb_scissor[0];
4925         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4926         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4927         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4928         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4929         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4930         screen[3] = _mm_setzero_ps();
4931         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4932         for (i = 0;i < numtriangles;i++)
4933         {
4934                 const float *screencoord4f = command->arrays;
4935                 const float *arrays = screencoord4f + numvertices*4;
4936
4937                 // generate the 3 edges of this triangle
4938                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4939                 if (element3s)
4940                 {
4941                         e[0] = element3s[i*3+0] - firstvertex;
4942                         e[1] = element3s[i*3+1] - firstvertex;
4943                         e[2] = element3s[i*3+2] - firstvertex;
4944                 }
4945                 else if (element3i)
4946                 {
4947                         e[0] = element3i[i*3+0] - firstvertex;
4948                         e[1] = element3i[i*3+1] - firstvertex;
4949                         e[2] = element3i[i*3+2] - firstvertex;
4950                 }
4951                 else
4952                 {
4953                         e[0] = i*3+0;
4954                         e[1] = i*3+1;
4955                         e[2] = i*3+2;
4956                 }
4957
4958 #define SKIPBACKFACE \
4959                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4960                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4961                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4962                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4963                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4964                 switch(cullface) \
4965                 { \
4966                 case GL_BACK: \
4967                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4968                                 continue; \
4969                         break; \
4970                 case GL_FRONT: \
4971                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4972                                 continue; \
4973                         break; \
4974                 }
4975
4976 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4977                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4978                         { \
4979                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4980                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4981                         }
4982 #define CLIPPEDVERTEXCOPY(k,p1) \
4983                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4984
4985 #define GENATTRIBCOPY(attrib, p1) \
4986                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4987 #define GENATTRIBLERP(attrib, p1, p2) \
4988                 { \
4989                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4990                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4991                 }
4992 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4993                 switch(clipcase) \
4994                 { \
4995                 default: \
4996                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4997                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4998                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4999                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
5000                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
5001                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
5002                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
5003                 }
5004
5005                 if (! clipped)
5006                         goto notclipped;
5007
5008                 // calculate distance from nearplane
5009                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
5010                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
5011                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
5012                 if (clipdist[0] >= 0.0f)
5013                 {
5014                         if (clipdist[1] >= 0.0f)
5015                         {
5016                                 if (clipdist[2] >= 0.0f)
5017                                 {
5018                                 notclipped:
5019                                         // triangle is entirely in front of nearplane
5020                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5021                                         SKIPBACKFACE;
5022                                         numpoints = 3;
5023                                         clipcase = 0;
5024                                 }
5025                                 else
5026                                 {
5027                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5028                                         SKIPBACKFACE;
5029                                         numpoints = 4;
5030                                         clipcase = 1;
5031                                 }
5032                         }
5033                         else
5034                         {
5035                                 if (clipdist[2] >= 0.0f)
5036                                 {
5037                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5038                                         SKIPBACKFACE;
5039                                         numpoints = 4;
5040                                         clipcase = 2;
5041                                 }
5042                                 else
5043                                 {
5044                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5045                                         SKIPBACKFACE;
5046                                         numpoints = 3;
5047                                         clipcase = 3;
5048                                 }
5049                         }
5050                 }
5051                 else if (clipdist[1] >= 0.0f)
5052                 {
5053                         if (clipdist[2] >= 0.0f)
5054                         {
5055                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5056                                 SKIPBACKFACE;
5057                                 numpoints = 4;
5058                                 clipcase = 4;
5059                         }
5060                         else
5061                         {
5062                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5063                                 SKIPBACKFACE;
5064                                 numpoints = 3;
5065                                 clipcase = 5;
5066                         }
5067                 }
5068                 else if (clipdist[2] >= 0.0f)
5069                 {
5070                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5071                         SKIPBACKFACE;
5072                         numpoints = 3;
5073                         clipcase = 6;
5074                 }
5075                 else continue; // triangle is entirely behind nearplane
5076
5077                 {
5078                         // calculate integer y coords for triangle points
5079                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5080                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5081                                         screenmin = _mm_min_epi16(screeni, screenir),
5082                                         screenmax = _mm_max_epi16(screeni, screenir);
5083                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5084                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5085                         screenmin = _mm_max_epi16(screenmin, fbmin);
5086                         screenmax = _mm_min_epi16(screenmax, fbmax);
5087                         // skip offscreen triangles
5088                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5089                                 continue;
5090                         starty = _mm_extract_epi16(screenmin, 1);
5091                         endy = _mm_extract_epi16(screenmax, 1)+1;
5092                         if (starty >= maxy1 && endy <= miny2)
5093                                 continue;
5094                         screeny = _mm_srai_epi32(screeni, 16);
5095                 }
5096
5097                 triangle = &thread->triangles[thread->numtriangles];
5098
5099                 // calculate attribute plans for triangle data...
5100                 // okay, this triangle is going to produce spans, we'd better project
5101                 // the interpolants now (this is what gives perspective texturing),
5102                 // this consists of simply multiplying all arrays by the W coord
5103                 // (which is basically 1/Z), which will be undone per-pixel
5104                 // (multiplying by Z again) to get the perspective-correct array
5105                 // values
5106                 {
5107                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5108                         __m128 mipedgescale, mipdensity;
5109                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5110                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5111                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5112                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5113                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5114                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5115                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5116                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5117                         attribedge1 = _mm_sub_ss(w0, w1);
5118                         attribedge2 = _mm_sub_ss(w2, w1);
5119                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5120                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5121                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5122                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5123                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5124                         _mm_store_ss(&triangle->w[0], attribxslope);
5125                         _mm_store_ss(&triangle->w[1], attribyslope);
5126                         _mm_store_ss(&triangle->w[2], attriborigin);
5127                         
5128                         clip0origin = 0;
5129                         clip0slope = 0;
5130                         clip0dir = 0;
5131                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5132                         {
5133                                 float cliporigin, clipxslope, clipyslope;
5134                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5135                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5136                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5137                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5138                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5139                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5140                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5141                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5142                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5143                                 if(clipxslope != 0)
5144                                 {
5145                                         clip0origin = -cliporigin/clipxslope;
5146                                         clip0slope = -clipyslope/clipxslope;
5147                                         clip0dir = clipxslope > 0 ? 1 : -1;
5148                                 }
5149                                 else if(clipyslope > 0)
5150                                 {
5151                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5152                                         clip0slope = dpsoftrast.fb_width;
5153                                         clip0dir = -1;
5154                                 }
5155                                 else if(clipyslope < 0)
5156                                 {
5157                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5158                                         clip0slope = -dpsoftrast.fb_width;
5159                                         clip0dir = -1;
5160                                 }
5161                                 else if(clip0origin < 0) continue;
5162                         }
5163
5164                         mipedgescale = _mm_setzero_ps();
5165                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5166                         {
5167                                 __m128 attrib0, attrib1, attrib2;
5168                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5169                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5170                                         break;
5171                                 arrays += numvertices*4;
5172                                 GENATTRIBS(attrib0, attrib1, attrib2);
5173                                 attriborigin = _mm_mul_ps(attrib1, w1);
5174                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5175                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5176                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5177                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5178                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5179                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5180                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5181                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5182                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5183                                 {
5184                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5185                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5186                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5187                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5188                                 }
5189                         }
5190
5191                         memset(triangle->mip, 0, sizeof(triangle->mip));
5192                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5193                         {
5194                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5195                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5196                                         break;
5197                                 texture = thread->texbound[texunit];
5198                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5199                                 {
5200                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5201                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5202                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5203                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5204                                         // this will be multiplied in the texturing routine by the texture resolution
5205                                         y = _mm_cvtss_si32(mipdensity);
5206                                         if (y > 0)
5207                                         {
5208                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5209                                                 if (y > texture->mipmaps - 1)
5210                                                         y = texture->mipmaps - 1;
5211                                                 triangle->mip[texunit] = y;
5212                                         }
5213                                 }
5214                         }
5215                 }
5216         
5217                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5218                 for (; y < bandy;)
5219                 {
5220                         __m128 xcoords, xslope;
5221                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5222                         int yccmask = _mm_movemask_epi8(ycc);
5223                         int edge0p, edge0n, edge1p, edge1n;
5224                         int nexty;
5225                         float w, wslope;
5226                         float clip0;
5227                         if (numpoints == 4)
5228                         {
5229                                 switch(yccmask)
5230                                 {
5231                                 default:
5232                                 case 0xFFFF: /*0000*/ y = endy; continue;
5233                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5234                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5235                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5236                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5237                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5238                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5239                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5240                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5241                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5242                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5243                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5244                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5245                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5246                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5247                                 case 0x0000: /*1111*/ y++; continue;
5248                                 }
5249                         }
5250                         else
5251                         {
5252                                 switch(yccmask)
5253                                 {
5254                                 default:
5255                                 case 0xFFFF: /*000*/ y = endy; continue;
5256                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5257                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5258                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5259                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5260                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5261                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5262                                 case 0x0000: /*111*/ y++; continue;
5263                                 }
5264                         }
5265                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5266                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5267                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5268                         nexty = _mm_extract_epi16(ycc, 0);
5269                         if (nexty >= bandy) nexty = bandy-1;
5270                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5271                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5272                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5273                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5274                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5275                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5276                         {
5277                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5278                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5279                         }
5280                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5281                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5282                         {
5283                                 int startx, endx, offset;
5284                                 startx = _mm_cvtss_si32(xcoords);
5285                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5286                                 if (startx < minx) startx = minx;
5287                                 if (endx > maxx) endx = maxx;
5288                                 if (startx >= endx) continue;
5289
5290                                 if (clip0dir)
5291                                 {
5292                                         if (clip0dir > 0)
5293                                         {
5294                                                 if (startx < clip0) 
5295                                                 {
5296                                                         if(endx <= clip0) continue;
5297                                                         startx = (int)clip0;
5298                                                 }
5299                                         }
5300                                         else if (endx > clip0) 
5301                                         {
5302                                                 if(startx >= clip0) continue;
5303                                                 endx = (int)clip0;
5304                                         }
5305                                 }
5306                                                 
5307                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5308                                 {
5309                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5310                                         span->triangle = thread->numtriangles;
5311                                         span->x = offset;
5312                                         span->y = y;
5313                                         span->startx = 0;
5314                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5315                                         if (span->startx >= span->endx)
5316                                                 continue;
5317                                         wslope = triangle->w[0];
5318                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5319                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5320                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5321                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5322                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5323                                 }
5324                         }
5325                 }
5326
5327                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5328                 {
5329                         DPSOFTRAST_Draw_ProcessSpans(thread);
5330                         thread->numtriangles = 0;
5331                 }
5332         }
5333
5334         if (!ATOMIC_DECREMENT(command->refcount))
5335         {
5336                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5337                         MM_FREE(command->arrays);
5338         }
5339
5340         if (thread->numspans > 0 || thread->numtriangles > 0)
5341         {
5342                 DPSOFTRAST_Draw_ProcessSpans(thread);
5343                 thread->numtriangles = 0;
5344         }
5345 #endif
5346 }
5347
5348 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5349 {
5350         int i;
5351         int j;
5352         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5353         int datasize = 2*numvertices*sizeof(float[4]);
5354         DPSOFTRAST_Command_Draw *command;
5355         unsigned char *data;
5356         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5357         {
5358                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5359                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5360                         break;
5361                 datasize += numvertices*sizeof(float[4]);
5362         }
5363         if (element3s)
5364                 datasize += numtriangles*sizeof(unsigned short[3]);
5365         else if (element3i)
5366                 datasize += numtriangles*sizeof(int[3]);
5367         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5368         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5369         {
5370                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5371                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5372         }
5373         else
5374         {
5375                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5376                 data = (unsigned char *)command + commandsize;
5377         }
5378         command->firstvertex = firstvertex;
5379         command->numvertices = numvertices;
5380         command->numtriangles = numtriangles;
5381         command->arrays = (float *)data;
5382         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5383         dpsoftrast.firstvertex = firstvertex;
5384         dpsoftrast.numvertices = numvertices;
5385         dpsoftrast.screencoord4f = (float *)data;
5386         data += numvertices*sizeof(float[4]);
5387         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5388         data += numvertices*sizeof(float[4]);
5389         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5390         {
5391                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5392                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5393                         break;
5394                 dpsoftrast.post_array4f[j] = (float *)data;
5395                 data += numvertices*sizeof(float[4]);
5396         }
5397         command->element3i = NULL;
5398         command->element3s = NULL;
5399         if (element3s)
5400         {
5401                 command->element3s = (unsigned short *)data;
5402                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5403         }
5404         else if (element3i)
5405         {
5406                 command->element3i = (int *)data;
5407                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5408         }
5409         return command;
5410 }
5411
5412 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5413 {
5414         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5415         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5416         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5417         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5418         if (command->starty >= command->endy)
5419         {
5420                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5421                         MM_FREE(command->arrays);
5422                 DPSOFTRAST_UndoCommand(command->commandsize);
5423                 return;
5424         }
5425         command->clipped = dpsoftrast.drawclipped;
5426         command->refcount = dpsoftrast.numthreads;
5427
5428         if (dpsoftrast.usethreads)
5429         {
5430                 int i;
5431                 DPSOFTRAST_Draw_SyncCommands();
5432                 for (i = 0; i < dpsoftrast.numthreads; i++)
5433                 {
5434                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5435                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5436                                 Thread_CondSignal(thread->drawcond);
5437                 }
5438         }
5439         else
5440         {
5441                 DPSOFTRAST_Draw_FlushThreads();
5442         }
5443 }
5444
5445 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
5446 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5447 {
5448         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5449 }
5450 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5451 {
5452         DPSOFTRAST_Command_SetRenderTargets *command;
5453         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5454                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5455                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5456                 DPSOFTRAST_Flush();
5457         dpsoftrast.fb_width = width;
5458         dpsoftrast.fb_height = height;
5459         dpsoftrast.fb_depthpixels = depthpixels;
5460         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5461         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5462         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5463         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5464         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5465         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5466         command->width = width;
5467         command->height = height;
5468 }
5469  
5470 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5471 {
5472         int commandoffset = thread->commandoffset;
5473         while (commandoffset != endoffset)
5474         {
5475                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5476                 switch (command->opcode)
5477                 {
5478 #define INTERPCOMMAND(name) \
5479                 case DPSOFTRAST_OPCODE_##name : \
5480                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5481                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5482                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5483                                 commandoffset = 0; \
5484                         break;
5485                 INTERPCOMMAND(Viewport)
5486                 INTERPCOMMAND(ClearColor)
5487                 INTERPCOMMAND(ClearDepth)
5488                 INTERPCOMMAND(ColorMask)
5489                 INTERPCOMMAND(DepthTest)
5490                 INTERPCOMMAND(ScissorTest)
5491                 INTERPCOMMAND(Scissor)
5492                 INTERPCOMMAND(BlendFunc)
5493                 INTERPCOMMAND(BlendSubtract)
5494                 INTERPCOMMAND(DepthMask)
5495                 INTERPCOMMAND(DepthFunc)
5496                 INTERPCOMMAND(DepthRange)
5497                 INTERPCOMMAND(PolygonOffset)
5498                 INTERPCOMMAND(CullFace)
5499                 INTERPCOMMAND(SetTexture)
5500                 INTERPCOMMAND(SetShader)
5501                 INTERPCOMMAND(Uniform4f)
5502                 INTERPCOMMAND(UniformMatrix4f)
5503                 INTERPCOMMAND(Uniform1i)
5504                 INTERPCOMMAND(SetRenderTargets)
5505                 INTERPCOMMAND(ClipPlane)
5506
5507                 case DPSOFTRAST_OPCODE_Draw:
5508                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5509                         commandoffset += command->commandsize;
5510                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5511                                 commandoffset = 0;
5512                         thread->commandoffset = commandoffset;
5513                         break;
5514
5515                 case DPSOFTRAST_OPCODE_Reset:
5516                         commandoffset = 0;
5517                         break;
5518                 }
5519         }
5520         thread->commandoffset = commandoffset;
5521 }
5522
5523 static int DPSOFTRAST_Draw_Thread(void *data)
5524 {
5525         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5526         while(thread->index >= 0)
5527         {
5528                 if (thread->commandoffset != dpsoftrast.drawcommand)
5529                 {
5530                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5531                 }
5532                 else 
5533                 {
5534                         Thread_LockMutex(thread->drawmutex);
5535                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5536                         {
5537                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5538                                 thread->starving = true;
5539                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5540                                 thread->starving = false;
5541                         }
5542                         Thread_UnlockMutex(thread->drawmutex);
5543                 }
5544         }   
5545         return 0;
5546 }
5547
5548 static void DPSOFTRAST_Draw_FlushThreads(void)
5549 {
5550         DPSOFTRAST_State_Thread *thread;
5551         int i;
5552         DPSOFTRAST_Draw_SyncCommands();
5553         if (dpsoftrast.usethreads) 
5554         {
5555                 for (i = 0; i < dpsoftrast.numthreads; i++)
5556                 {
5557                         thread = &dpsoftrast.threads[i];
5558                         if (thread->commandoffset != dpsoftrast.drawcommand)
5559                         {
5560                                 Thread_LockMutex(thread->drawmutex);
5561                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5562                                         Thread_CondSignal(thread->drawcond);
5563                                 Thread_UnlockMutex(thread->drawmutex);
5564                         }
5565                 }
5566                 for (i = 0; i < dpsoftrast.numthreads; i++)
5567                 {
5568                         thread = &dpsoftrast.threads[i];
5569                         if (thread->commandoffset != dpsoftrast.drawcommand)
5570                         {
5571                                 Thread_LockMutex(thread->drawmutex);
5572                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5573                                 {
5574                                         thread->waiting = true;
5575                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5576                                         thread->waiting = false;
5577                                 }
5578                                 Thread_UnlockMutex(thread->drawmutex);
5579                         }
5580                 }
5581         }
5582         else
5583         {
5584                 for (i = 0; i < dpsoftrast.numthreads; i++)
5585                 {
5586                         thread = &dpsoftrast.threads[i];
5587                         if (thread->commandoffset != dpsoftrast.drawcommand)
5588                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5589                 }
5590         }
5591         dpsoftrast.commandpool.usedcommands = 0;
5592 }
5593
// Public entry point that executes all queued commands; it simply forwards
// to DPSOFTRAST_Draw_FlushThreads().
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5598
// Currently identical to DPSOFTRAST_Flush(): returns once all queued
// commands have been executed.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5603
5604 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5605 {
5606         int i;
5607         union
5608         {
5609                 int i;
5610                 unsigned char b[4];
5611         }
5612         u;
5613         u.i = 1;
5614         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5615         dpsoftrast.bigendian = u.b[3];
5616         dpsoftrast.fb_width = width;
5617         dpsoftrast.fb_height = height;
5618         dpsoftrast.fb_depthpixels = depthpixels;
5619         dpsoftrast.fb_colorpixels[0] = colorpixels;
5620         dpsoftrast.fb_colorpixels[1] = NULL;
5621         dpsoftrast.fb_colorpixels[1] = NULL;
5622         dpsoftrast.fb_colorpixels[1] = NULL;
5623         dpsoftrast.viewport[0] = 0;
5624         dpsoftrast.viewport[1] = 0;
5625         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5626         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5627         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5628         dpsoftrast.texture_firstfree = 1;
5629         dpsoftrast.texture_end = 1;
5630         dpsoftrast.texture_max = 0;
5631         dpsoftrast.color[0] = 1;
5632         dpsoftrast.color[1] = 1;
5633         dpsoftrast.color[2] = 1;
5634         dpsoftrast.color[3] = 1;
5635         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5636         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5637         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5638         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5639         for (i = 0; i < dpsoftrast.numthreads; i++)
5640         {
5641                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5642                 thread->index = i;
5643                 thread->cullface = GL_BACK;
5644         thread->colormask[0] = 1; 
5645                 thread->colormask[1] = 1;
5646                 thread->colormask[2] = 1;
5647                 thread->colormask[3] = 1;
5648                 thread->blendfunc[0] = GL_ONE;
5649                 thread->blendfunc[1] = GL_ZERO;
5650                 thread->depthmask = true;
5651                 thread->depthtest = true;
5652                 thread->depthfunc = GL_LEQUAL;
5653                 thread->scissortest = false;
5654                 thread->viewport[0] = 0;
5655                 thread->viewport[1] = 0;
5656                 thread->viewport[2] = dpsoftrast.fb_width;
5657                 thread->viewport[3] = dpsoftrast.fb_height;
5658                 thread->scissor[0] = 0;
5659                 thread->scissor[1] = 0;
5660                 thread->scissor[2] = dpsoftrast.fb_width;
5661                 thread->scissor[3] = dpsoftrast.fb_height;
5662                 thread->depthrange[0] = 0;
5663                 thread->depthrange[1] = 1;
5664                 thread->polygonoffset[0] = 0;
5665                 thread->polygonoffset[1] = 0;
5666                 thread->clipplane[0] = 0;
5667                 thread->clipplane[1] = 0;
5668                 thread->clipplane[2] = 0;
5669                 thread->clipplane[3] = 1;
5670         
5671                 thread->numspans = 0;
5672                 thread->numtriangles = 0;
5673                 thread->commandoffset = 0;
5674                 thread->waiting = false;
5675                 thread->starving = false;
5676            
5677                 thread->validate = -1;
5678                 DPSOFTRAST_Validate(thread, -1);
5679  
5680                 if (dpsoftrast.usethreads)
5681                 {
5682                         thread->waitcond = Thread_CreateCond();
5683                         thread->drawcond = Thread_CreateCond();
5684                         thread->drawmutex = Thread_CreateMutex();
5685                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5686                 }
5687         }
5688         return 0;
5689 }
5690
5691 void DPSOFTRAST_Shutdown(void)
5692 {
5693         int i;
5694         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5695         {
5696                 DPSOFTRAST_State_Thread *thread;
5697                 for (i = 0; i < dpsoftrast.numthreads; i++)
5698                 {
5699                         thread = &dpsoftrast.threads[i];
5700                         Thread_LockMutex(thread->drawmutex);
5701                         thread->index = -1;
5702                         Thread_CondSignal(thread->drawcond);
5703                         Thread_UnlockMutex(thread->drawmutex);
5704                         Thread_WaitThread(thread->thread, 0);
5705                         Thread_DestroyCond(thread->waitcond);
5706                         Thread_DestroyCond(thread->drawcond);
5707                         Thread_DestroyMutex(thread->drawmutex);
5708                 }
5709         }
5710         for (i = 0;i < dpsoftrast.texture_end;i++)
5711                 if (dpsoftrast.texture[i].bytes)
5712                         MM_FREE(dpsoftrast.texture[i].bytes);
5713         if (dpsoftrast.texture)
5714                 free(dpsoftrast.texture);
5715         if (dpsoftrast.threads)
5716                 MM_FREE(dpsoftrast.threads);
5717         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5718 }
5719