]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
FinishBGRA8 optimization and fixes
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
#ifdef _MSC_VER
// MSVC only: turn off warning 4324
// NOTE(review): presumably the "structure was padded due to alignment specifier"
// warning triggered by the ALIGN/ATOMIC declarations in this file - confirm
#pragma warning(disable : 4324)
#endif

#ifndef __cplusplus
// C builds: make 'bool' an alias of the engine's qboolean type
typedef qboolean bool;
#endif
16
// ALIGN_SIZE is reused as COMMAND_SIZE; ATOMIC_SIZE is the alignment passed to _mm_malloc by MM_MALLOC/MM_CALLOC
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32

// per-compiler definitions, only chosen when SSE_POSSIBLE is defined:
//   ALIGN(var)                 declare var with 16 byte alignment
//   ATOMIC(var)                declare var with 32 byte alignment
//   MEMORY_BARRIER             store fence (_mm_sfence)
//   ATOMIC_COUNTER             integer type accepted by the ATOMIC_* operations
//   ATOMIC_INCREMENT/DECREMENT atomically add/subtract 1
//   ATOMIC_ADD                 atomically add val, result discarded
// any macro left undefined by this block is given a plain definition right after it
#ifdef SSE_POSSIBLE
        #if defined(__APPLE__)
                #include <libkern/OSAtomic.h>
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
                #define MEMORY_BARRIER (_mm_sfence())
                #define ATOMIC_COUNTER volatile int32_t 
                #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
                #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
        #elif defined(__GNUC__)
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile int
                #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
                #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
                #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
        #elif defined(_MSC_VER)
                #define ALIGN(var) __declspec(align(16)) var
                #define ATOMIC(var) __declspec(align(32)) var
                #define MEMORY_BARRIER (_mm_sfence())
                //(MemoryBarrier())
                #define ATOMIC_COUNTER volatile LONG
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
        #endif
#endif
50
// plain definitions for every macro that was not defined above:
// no alignment attribute, no barrier, and ordinary (non-atomic) integer arithmetic
#ifndef ALIGN
#define ALIGN(var) var
#endif
#ifndef ATOMIC
#define ATOMIC(var) var
#endif
#ifndef MEMORY_BARRIER
#define MEMORY_BARRIER ((void)0)
#endif
#ifndef ATOMIC_COUNTER
#define ATOMIC_COUNTER int
#endif
#ifndef ATOMIC_INCREMENT
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#ifndef ATOMIC_DECREMENT
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#ifndef ATOMIC_ADD
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
72
// MM_MALLOC / MM_CALLOC / MM_FREE: allocation helpers used for pixel storage.
// With SSE_POSSIBLE the memory comes from _mm_malloc aligned to ATOMIC_SIZE and
// must be released with _mm_free; otherwise they are the plain C library calls.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// aligned equivalent of calloc: returns nmemb*size zeroed bytes, or NULL if the
// allocation fails or the byte count does not fit in size_t
static void *MM_CALLOC(size_t nmemb, size_t size)
{
        size_t total;
        void *ptr;
        // calloc rejects requests whose byte count wraps around, so do the same
        // instead of silently allocating a buffer that is too small
        if (size != 0 && nmemb > (size_t)-1 / size)
                return NULL;
        total = nmemb * size;
        ptr = _mm_malloc(total, ATOMIC_SIZE);
        if (ptr != NULL)
                memset(ptr, 0, total);
        return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
91
// indices of the per-vertex attribute arrays (first index of
// DPSOFTRAST_State_Triangle::attribs and of DPSOFTRAST_State::post_array4f)
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION,
        DPSOFTRAST_ARRAY_COLOR,
        DPSOFTRAST_ARRAY_TEXCOORD0,
        DPSOFTRAST_ARRAY_TEXCOORD1,
        DPSOFTRAST_ARRAY_TEXCOORD2,
        DPSOFTRAST_ARRAY_TEXCOORD3,
        DPSOFTRAST_ARRAY_TEXCOORD4,
        DPSOFTRAST_ARRAY_TEXCOORD5,
        DPSOFTRAST_ARRAY_TEXCOORD6,
        DPSOFTRAST_ARRAY_TEXCOORD7,
        DPSOFTRAST_ARRAY_TOTAL // number of arrays, not an array itself
}
DPSOFTRAST_ARRAY;
107
// one slot of the dpsoftrast.texture array; a slot whose bytes pointer is NULL is free
typedef struct DPSOFTRAST_Texture_s
{
        int flags; // flags value passed to DPSOFTRAST_Texture_New
        int width; // level 0 dimensions as passed to DPSOFTRAST_Texture_New
        int height;
        int depth;
        int sides; // 6 when DPSOFTRAST_TEXTURE_FLAG_CUBEMAP is set, otherwise 1
        DPSOFTRAST_TEXTURE_FILTER filter;
        int mipmaps; // number of used entries in mipmap[]
        int size; // total size in bytes of all levels
        ATOMIC_COUNTER binds; // while non-zero, DPSOFTRAST_Flush is called before the pixels are changed or freed
        unsigned char *bytes; // storage for all levels, 4 bytes per pixel
        int mipmap[DPSOFTRAST_MAXMIPMAPS][5]; // per level: [0] byte offset into bytes, [1] size in bytes, [2] width, [3] height, [4] depth
}
DPSOFTRAST_Texture;
123
// commands are aligned the same way as ALIGN() data
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// common header shared by every command struct
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode; // one of the DPSOFTRAST_OPCODE_ values
        unsigned short commandsize; // NOTE(review): presumably the size of the whole command in bytes - confirm against the code that fills it in
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares the enum constant
// DPSOFTRAST_OPCODE_<name> with value opcodeval and an aligned struct
// DPSOFTRAST_Command_<name> that starts with the common header followed by fields
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );
144
// size in bytes of the command buffer below
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the largest size a single command may have - confirm at the allocation site
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// storage for queued commands
// NOTE(review): freecommand/usedcommands look like a write offset and a byte count
// for a ring buffer - confirm against the command allocator
typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
{
        int freecommand;
        int usedcommands;
        ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
155
// per-triangle data used while shading its spans
typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
{
        unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        float w[3];
        // attribute planes as consumed by DPSOFTRAST_CALCATTRIB:
        // value at (x, y) = attribs[array][2] + x*attribs[array][0] + y*attribs[array][1]
        ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
163
// DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex)
// evaluates attribute plane arrayindex of triangle at the span's (x, y) position:
//   slope = attribs[arrayindex][0]                      (change per pixel in x)
//   data  = attribs[arrayindex][2] + x*slope + y*attribs[arrayindex][1]
// triangle and span are pointers, data and slope are __m128 lvalues (SSE version)
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// scalar version of DPSOFTRAST_CALCATTRIB: data and slope are float[4] arrays
// (every macro argument is parenthesized so that pointer expressions such as
// "spans + i" can be passed for triangle/span)
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        (slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
        (slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
        (slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
        (slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
        (data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
        (data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
        (data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
        (data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
180                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// one horizontal run of pixels belonging to a triangle
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle; // triangle this span was generated by
        int x; // framebuffer x coord
        int y; // framebuffer y coord
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
        int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
        int depthslope; // depthbuffer value pixel delta
}
DPSOFTRAST_State_Span);
195
// capacities of the per-thread spans[], triangles[] and pixelmaskarray[] buffers
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// bits of DPSOFTRAST_State_Thread::validate; a set bit means the matching
// derived fb_ values are stale and DPSOFTRAST_Validate has to recompute them
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
204
// blend modes selected by DPSOFTRAST_RecalcBlendFunc from the (source, destination)
// blend factor pair; the pairs are listed next to each mode
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE, // GL_ONE, GL_ZERO - also used for every unrecognized pair
        DPSOFTRAST_BLENDMODE_ALPHA, // GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA
        DPSOFTRAST_BLENDMODE_ADDALPHA, // GL_SRC_ALPHA, GL_ONE
        DPSOFTRAST_BLENDMODE_ADD, // GL_ONE, GL_ONE
        DPSOFTRAST_BLENDMODE_INVMOD, // GL_ZERO, GL_ONE_MINUS_SRC_COLOR
        DPSOFTRAST_BLENDMODE_MUL, // GL_ZERO, GL_SRC_COLOR or GL_DST_COLOR, GL_ZERO
        DPSOFTRAST_BLENDMODE_MUL2, // GL_DST_COLOR, GL_SRC_COLOR
        DPSOFTRAST_BLENDMODE_SUBALPHA, // GL_SRC_ALPHA, GL_ONE while blendsubtract is set
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA, // GL_ONE, GL_ONE_MINUS_SRC_ALPHA
        DPSOFTRAST_BLENDMODE_INVADD, // GL_ONE_MINUS_DST_COLOR, GL_ONE
        DPSOFTRAST_BLENDMODE_TOTAL // number of modes
}
DPSOFTRAST_BLENDMODE;
220
// state owned by one rasterizer thread: its copy of the render state, the
// values derived from it, and its span/triangle work buffers
typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
{
        void *thread;
        int index; // position of this thread among dpsoftrast.numthreads, used to pick its bands
        
        // render state
        int cullface;
        int colormask[4];
        int blendfunc[2]; // source factor, destination factor
        int blendsubtract;
        int depthmask;
        int depthtest;
        int depthfunc;
        int scissortest;
        int alphatest;
        int alphafunc;
        float alphavalue;
        int viewport[4]; // x, y, width, height
        int scissor[4]; // x, y, width, height
        float depthrange[2];
        float polygonoffset[2];
        float clipplane[4];
        ALIGN(float fb_clipplane[4]); // clipplane converted by DPSOFTRAST_RecalcClipPlane using the viewport scale/center

        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // point into dpsoftrast.texture, rebased when that array grows
        
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // DPSOFTRAST_VALIDATE_ flags
        int validate;

        // derived values (DPSOFTRAST_VALIDATE_FB)
        int fb_colormask;
        int fb_scissor[4]; // x, y, width, height clamped to the framebuffer, y measured from the top
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
        int fb_depthfunc; // depthfunc, or GL_ALWAYS while depthtest is off

        // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
        int fb_blendmode; // a DPSOFTRAST_BLENDMODE value

        // band boundaries
        // (two row ranges when dpsoftrast.interlace is set, otherwise both ranges are equal)
        int miny1;
        int maxy1;
        int miny2;
        int maxy2;

        ATOMIC(volatile int commandoffset);

        volatile bool waiting;
        volatile bool starving;
        void *waitcond;
        void *drawcond;
        void *drawmutex;

        int numspans;
        int numtriangles;
        DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
        DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
        unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
}
DPSOFTRAST_State_Thread);
289
// global rasterizer state: framebuffer, vertex array pointers, texture table,
// worker threads and the command pool
typedef ATOMIC(struct DPSOFTRAST_State_s
{
        int fb_width;
        int fb_height;
        unsigned int *fb_depthpixels;
        unsigned int *fb_colorpixels[4];

        int viewport[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        float color[4];
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        const float *pointer_vertex3f;
        const float *pointer_color4f;
        const unsigned char *pointer_color4ub;
        const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int stride_vertex;
        int stride_color;
        int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // point into texture[], rebased when that array grows

        int firstvertex;
        int numvertices;
        float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
        float *screencoord4f;
        int drawstarty;
        int drawendy;
        int drawclipped;
        
        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        int texture_max; // number of slots allocated in texture[]
        int texture_end; // one past the highest slot in use
        int texture_firstfree; // slot index where the search for a free slot starts
        DPSOFTRAST_Texture *texture; // texture slot array, grown by DPSOFTRAST_Texture_Grow

        int bigendian;

        // error reporting
        const char *errorstring;

        bool usethreads;
        int interlace; // non-zero: each thread gets two row bands instead of one
        int numthreads;
        DPSOFTRAST_State_Thread *threads;

        ATOMIC(volatile int drawcommand);

        DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

// the one global rasterizer instance used throughout this file
DPSOFTRAST_State dpsoftrast;
349
// depth buffer scale (2^30) and offset
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// packs four floats (nominally 0..1) into one int laid out as 0xAARRGGBB, each
// channel rounded to 0..255; the shifts are done on unsigned values because
// shifting an alpha of 128 or more into bit 31 of a signed int is undefined,
// and the result is cast back so the macro still yields an int
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// converts a float depth d to the integer depth buffer value DPSOFTRAST_DEPTHSCALE * (1 - d)
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
354
355 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
356 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
357
358 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
359 {
360         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
361         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
362         fb_viewportcenter[3] = 0.5f;
363         fb_viewportcenter[0] = 0.0f;
364         fb_viewportscale[1] = 0.5f * viewport[2];
365         fb_viewportscale[2] = -0.5f * viewport[3];
366         fb_viewportscale[3] = 0.5f;
367         fb_viewportscale[0] = 1.0f;
368 }
369
370 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
371 {
372         if (dpsoftrast.interlace)
373         {
374                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
375                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
376                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
377                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
378         }
379         else
380         {
381                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
382                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
383         }
384 }
385
386 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
387 {
388         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
389         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
390         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
391         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
392         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
393 }
394
395 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
396 {
397         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
398         // and viewport projection values
399         int x1, x2;
400         int y1, y2;
401         x1 = thread->scissor[0];
402         x2 = thread->scissor[0] + thread->scissor[2];
403         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
404         y2 = dpsoftrast.fb_height - thread->scissor[1];
405         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
406         if (x1 < 0) x1 = 0;
407         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
408         if (y1 < 0) y1 = 0;
409         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
410         thread->fb_scissor[0] = x1;
411         thread->fb_scissor[1] = y1;
412         thread->fb_scissor[2] = x2 - x1;
413         thread->fb_scissor[3] = y2 - y1;
414
415         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
416         DPSOFTRAST_RecalcClipPlane(thread);
417         DPSOFTRAST_RecalcThread(thread);
418 }
419
420 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
421 {
422         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
423 }
424
425 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
426 {
427         if (thread->blendsubtract)
428         {
429                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
430                 {
431                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
432                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
433                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
434                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
435                 }
436         }
437         else
438         {       
439                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
440                 {
441                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
442                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
443                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
444                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
445                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
446                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
447                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
448                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
449                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
450                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
451                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
452                 }
453         }
454 }
455
// calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in f
// is still pending in thread->validate; always evaluates to 0
// (thread is parenthesized so that any pointer expression can be passed; note
// that both arguments are evaluated more than once)
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
457
458 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
459 {
460         mask &= thread->validate;
461         if (!mask)
462                 return;
463         if (mask & DPSOFTRAST_VALIDATE_FB)
464         {
465                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
466                 DPSOFTRAST_RecalcFB(thread);
467         }
468         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
469         {
470                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
471                 DPSOFTRAST_RecalcDepthFunc(thread);
472         }
473         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
474         {
475                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
476                 DPSOFTRAST_RecalcBlendFunc(thread);
477         }
478 }
479
480 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
481 {
482         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
483                 return &dpsoftrast.texture[index];
484         return NULL;
485 }
486
487 static void DPSOFTRAST_Texture_Grow(void)
488 {
489         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
490         DPSOFTRAST_State_Thread *thread;
491         int i;
492         int j;
493         DPSOFTRAST_Flush();
494         // expand texture array as needed
495         if (dpsoftrast.texture_max < 1024)
496                 dpsoftrast.texture_max = 1024;
497         else
498                 dpsoftrast.texture_max *= 2;
499         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
500         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
501                 if (dpsoftrast.texbound[i])
502                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
503         for (j = 0; j < dpsoftrast.numthreads; j++)
504         {
505                 thread = &dpsoftrast.threads[j];
506                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
507                         if (thread->texbound[i])
508                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
509         }
510 }
511
512 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
513 {
514         int w;
515         int h;
516         int d;
517         int size;
518         int s;
519         int texnum;
520         int mipmaps;
521         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
522         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
523         DPSOFTRAST_Texture *texture;
524         if (width*height*depth < 1)
525         {
526                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
527                 return 0;
528         }
529         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
530         {
531                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
532                 return 0;
533         }
534         switch(texformat)
535         {
536         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
537         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
538         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
539                 break;
540         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
541                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
542                 {
543                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
544                         return 0;
545                 }
546                 if (depth != 1)
547                 {
548                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
549                         return 0;
550                 }
551                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
552                 {
553                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
554                         return 0;
555                 }
556                 break;
557         }
558         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
559         {
560                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
561                 return 0;
562         }
563         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
564         {
565                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
566                 return 0;
567         }
568         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
569         {
570                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
571                 return 0;
572         }
573         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
574         {
575                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
576                 return 0;
577         }
578         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
579         {
580                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
581                 return 0;
582         }
583         // find first empty slot in texture array
584         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
585                 if (!dpsoftrast.texture[texnum].bytes)
586                         break;
587         dpsoftrast.texture_firstfree = texnum + 1;
588         if (dpsoftrast.texture_max <= texnum)
589                 DPSOFTRAST_Texture_Grow();
590         if (dpsoftrast.texture_end <= texnum)
591                 dpsoftrast.texture_end = texnum + 1;
592         texture = &dpsoftrast.texture[texnum];
593         memset(texture, 0, sizeof(*texture));
594         texture->flags = flags;
595         texture->width = width;
596         texture->height = height;
597         texture->depth = depth;
598         texture->sides = sides;
599         texture->binds = 0;
600         w = width;
601         h = height;
602         d = depth;
603         size = 0;
604         mipmaps = 0;
605         w = width;
606         h = height;
607         d = depth;
608         for (;;)
609         {
610                 s = w * h * d * sides * 4;
611                 texture->mipmap[mipmaps][0] = size;
612                 texture->mipmap[mipmaps][1] = s;
613                 texture->mipmap[mipmaps][2] = w;
614                 texture->mipmap[mipmaps][3] = h;
615                 texture->mipmap[mipmaps][4] = d;
616                 size += s;
617                 mipmaps++;
618                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
619                         break;
620                 if (w > 1) w >>= 1;
621                 if (h > 1) h >>= 1;
622                 if (d > 1) d >>= 1;
623         }
624         texture->mipmaps = mipmaps;
625         texture->size = size;
626
627         // allocate the pixels now
628         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
629
630         return texnum;
631 }
632 void DPSOFTRAST_Texture_Free(int index)
633 {
634         DPSOFTRAST_Texture *texture;
635         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
636         if (texture->binds)
637                 DPSOFTRAST_Flush();
638         if (texture->bytes)
639                 MM_FREE(texture->bytes);
640         texture->bytes = NULL;
641         memset(texture, 0, sizeof(*texture));
642         // adjust the free range and used range
643         if (dpsoftrast.texture_firstfree > index)
644                 dpsoftrast.texture_firstfree = index;
645         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
646                 dpsoftrast.texture_end--;
647 }
648 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
649 {
650         int i, x, y, z, w, layer0, layer1, row0, row1;
651         unsigned char *o, *i0, *i1, *i2, *i3;
652         DPSOFTRAST_Texture *texture;
653         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
654         if (texture->mipmaps <= 1)
655                 return;
656         for (i = 1;i < texture->mipmaps;i++)
657         {
658                 for (z = 0;z < texture->mipmap[i][4];z++)
659                 {
660                         layer0 = z*2;
661                         layer1 = z*2+1;
662                         if (layer1 >= texture->mipmap[i-1][4])
663                                 layer1 = texture->mipmap[i-1][4]-1;
664                         for (y = 0;y < texture->mipmap[i][3];y++)
665                         {
666                                 row0 = y*2;
667                                 row1 = y*2+1;
668                                 if (row1 >= texture->mipmap[i-1][3])
669                                         row1 = texture->mipmap[i-1][3]-1;
670                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
671                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
672                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
673                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
674                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
675                                 w = texture->mipmap[i][2];
676                                 if (layer1 > layer0)
677                                 {
678                                         if (texture->mipmap[i-1][2] > 1)
679                                         {
680                                                 // average 3D texture
681                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
682                                                 {
683                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
684                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
685                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
686                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
687                                                 }
688                                         }
689                                         else
690                                         {
691                                                 // average 3D mipmap with parent width == 1
692                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
693                                                 {
694                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
695                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
696                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
697                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
698                                                 }
699                                         }
700                                 }
701                                 else
702                                 {
703                                         if (texture->mipmap[i-1][2] > 1)
704                                         {
705                                                 // average 2D texture (common case)
706                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
707                                                 {
708                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
709                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
710                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
711                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
712                                                 }
713                                         }
714                                         else
715                                         {
716                                                 // 2D texture with parent width == 1
717                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
718                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
719                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
720                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
721                                         }
722                                 }
723                         }
724                 }
725         }
726 }
727 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
728 {
729         DPSOFTRAST_Texture *texture;
730         unsigned char *dst;
731         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
732         if (texture->binds)
733                 DPSOFTRAST_Flush();
734         if (pixels)
735         {
736                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
737                 while (blockheight > 0)
738                 {
739                         memcpy(dst, pixels, blockwidth * 4);
740                         pixels += blockwidth * 4;
741                         dst += texture->mipmap[0][2] * 4;
742                         blockheight--;
743                 }
744         }
745         DPSOFTRAST_Texture_CalculateMipmaps(index);
746 }
747 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
748 {
749         DPSOFTRAST_Texture *texture;
750         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
751         if (texture->binds)
752                 DPSOFTRAST_Flush();
753         if (pixels)
754                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
755         DPSOFTRAST_Texture_CalculateMipmaps(index);
756 }
757 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
758 {
759         DPSOFTRAST_Texture *texture;
760         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
761         return texture->mipmap[mip][2];
762 }
763 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
764 {
765         DPSOFTRAST_Texture *texture;
766         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
767         return texture->mipmap[mip][3];
768 }
769 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
770 {
771         DPSOFTRAST_Texture *texture;
772         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
773         return texture->mipmap[mip][4];
774 }
775 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
776 {
777         DPSOFTRAST_Texture *texture;
778         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
779         if (texture->binds)
780                 DPSOFTRAST_Flush();
781         return texture->bytes + texture->mipmap[mip][0];
782 }
783 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
784 {
785         DPSOFTRAST_Texture *texture;
786         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
787         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
788         {
789                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
790                 return;
791         }
792         if (texture->binds)
793                 DPSOFTRAST_Flush();
794         texture->filter = filter;
795 }
796
797 static void DPSOFTRAST_Draw_FlushThreads(void);
798
799 static void DPSOFTRAST_Draw_SyncCommands(void)
800 {
801         if(dpsoftrast.usethreads) MEMORY_BARRIER;
802         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
803 }
804
805 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
806 {
807         DPSOFTRAST_State_Thread *thread;
808         int i;
809         int freecommand = dpsoftrast.commandpool.freecommand;
810         int usedcommands = dpsoftrast.commandpool.usedcommands;
811         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
812                 return;
813         DPSOFTRAST_Draw_SyncCommands();
814         for(;;)
815         {
816                 int waitindex = -1;
817                 int commandoffset;
818                 usedcommands = 0;
819                 for (i = 0; i < dpsoftrast.numthreads; i++)
820                 {
821                         thread = &dpsoftrast.threads[i]; 
822                         commandoffset = freecommand - thread->commandoffset;
823                         if (commandoffset < 0)
824                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
825                         if (commandoffset > usedcommands)
826                         {
827                                 waitindex = i;
828                                 usedcommands = commandoffset;
829                         }
830                 }
831                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
832                         break;
833                 thread = &dpsoftrast.threads[waitindex];
834                 Thread_LockMutex(thread->drawmutex);
835                 if (thread->commandoffset != dpsoftrast.drawcommand)
836                 {
837                         thread->waiting = true;
838                         if (thread->starving) Thread_CondSignal(thread->drawcond);
839                         Thread_CondWait(thread->waitcond, thread->drawmutex);
840                         thread->waiting = false;
841                 }
842                 Thread_UnlockMutex(thread->drawmutex);
843         }
844         dpsoftrast.commandpool.usedcommands = usedcommands;
845 }
846
// Rounds 'size' up to the next multiple of COMMAND_SIZE.
// The mask arithmetic requires COMMAND_SIZE to be a power of two.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) & ~(COMMAND_SIZE-1))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name>, passing the struct size rounded up by
// DPSOFTRAST_ALIGNCOMMAND, and yields a pointer of the matching type.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
851
852 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
853 {
854         DPSOFTRAST_Command *command;
855         int freecommand = dpsoftrast.commandpool.freecommand;
856         int usedcommands = dpsoftrast.commandpool.usedcommands;
857         int extra = sizeof(DPSOFTRAST_Command);
858         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
859                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
860         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
861         {
862                 if (dpsoftrast.usethreads)
863                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
864                 else
865                         DPSOFTRAST_Draw_FlushThreads();
866                 freecommand = dpsoftrast.commandpool.freecommand;
867                 usedcommands = dpsoftrast.commandpool.usedcommands;
868         }
869         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
870         {
871                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
872                 command->opcode = DPSOFTRAST_OPCODE_Reset;
873                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
874                 freecommand = 0;
875         }
876         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
877         command->opcode = opcode;
878         command->commandsize = size;
879         freecommand += size;
880         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
881                 freecommand = 0;
882         dpsoftrast.commandpool.freecommand = freecommand;
883         dpsoftrast.commandpool.usedcommands = usedcommands + size;
884         return command;
885 }
886
887 static void DPSOFTRAST_UndoCommand(int size)
888 {
889         int freecommand = dpsoftrast.commandpool.freecommand;
890         int usedcommands = dpsoftrast.commandpool.usedcommands;
891         freecommand -= size;
892         if (freecommand < 0)
893                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
894         usedcommands -= size;
895         dpsoftrast.commandpool.freecommand = freecommand;
896         dpsoftrast.commandpool.usedcommands = usedcommands;
897 }
898                 
899 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
900 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
901 {
902         thread->viewport[0] = command->x;
903         thread->viewport[1] = command->y;
904         thread->viewport[2] = command->width;
905         thread->viewport[3] = command->height;
906         thread->validate |= DPSOFTRAST_VALIDATE_FB;
907 }
908 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
909 {
910         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
911         command->x = x;
912         command->y = y;
913         command->width = width;
914         command->height = height;
915
916         dpsoftrast.viewport[0] = x;
917         dpsoftrast.viewport[1] = y;
918         dpsoftrast.viewport[2] = width;
919         dpsoftrast.viewport[3] = height;
920         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
921 }
922
923 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
924 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
925 {
926         int i, x1, y1, x2, y2, w, h, x, y;
927         int miny1, maxy1, miny2, maxy2;
928         int bandy;
929         unsigned int *p;
930         unsigned int c;
931         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
932         miny1 = thread->miny1;
933         maxy1 = thread->maxy1;
934         miny2 = thread->miny2;
935         maxy2 = thread->maxy2;
936         x1 = thread->fb_scissor[0];
937         y1 = thread->fb_scissor[1];
938         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
939         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
940         if (y1 < miny1) y1 = miny1;
941         if (y2 > maxy2) y2 = maxy2;
942         w = x2 - x1;
943         h = y2 - y1;
944         if (w < 1 || h < 1)
945                 return;
946         // FIXME: honor fb_colormask?
947         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
948         for (i = 0;i < 4;i++)
949         {
950                 if (!dpsoftrast.fb_colorpixels[i])
951                         continue;
952                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
953                 for (;y < bandy;y++)
954                 {
955                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
956                         for (x = x1;x < x2;x++)
957                                 p[x] = c;
958                 }
959         }
960 }
961 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
962 {
963         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
964         command->r = r;
965         command->g = g;
966         command->b = b;
967         command->a = a;
968 }
969
970 DEFCOMMAND(3, ClearDepth, float depth;)
971 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
972 {
973         int x1, y1, x2, y2, w, h, x, y;
974         int miny1, maxy1, miny2, maxy2;
975         int bandy;
976         unsigned int *p;
977         unsigned int c;
978         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
979         miny1 = thread->miny1;
980         maxy1 = thread->maxy1;
981         miny2 = thread->miny2;
982         maxy2 = thread->maxy2;
983         x1 = thread->fb_scissor[0];
984         y1 = thread->fb_scissor[1];
985         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
986         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
987         if (y1 < miny1) y1 = miny1;
988         if (y2 > maxy2) y2 = maxy2;
989         w = x2 - x1;
990         h = y2 - y1;
991         if (w < 1 || h < 1)
992                 return;
993         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
994         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
995         for (;y < bandy;y++)
996         {
997                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
998                 for (x = x1;x < x2;x++)
999                         p[x] = c;
1000         }
1001 }
1002 void DPSOFTRAST_ClearDepth(float d)
1003 {
1004         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1005         command->depth = d;
1006 }
1007
1008 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1009 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1010 {
1011         thread->colormask[0] = command->r != 0;
1012         thread->colormask[1] = command->g != 0;
1013         thread->colormask[2] = command->b != 0;
1014         thread->colormask[3] = command->a != 0;
1015         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1016 }
1017 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1018 {
1019         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1020         command->r = r;
1021         command->g = g;
1022         command->b = b;
1023         command->a = a;
1024 }
1025
1026 DEFCOMMAND(5, DepthTest, int enable;)
1027 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1028 {
1029         thread->depthtest = command->enable;
1030         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1031 }
1032 void DPSOFTRAST_DepthTest(int enable)
1033 {
1034         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1035         command->enable = enable;
1036 }
1037
1038 DEFCOMMAND(6, ScissorTest, int enable;)
1039 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1040 {
1041         thread->scissortest = command->enable;
1042         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1043 }
1044 void DPSOFTRAST_ScissorTest(int enable)
1045 {
1046         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1047         command->enable = enable;
1048 }
1049
1050 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1051 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1052 {
1053         thread->scissor[0] = command->x;
1054         thread->scissor[1] = command->y;
1055         thread->scissor[2] = command->width;
1056         thread->scissor[3] = command->height;
1057         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1058 }
1059 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1060 {
1061         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1062         command->x = x;
1063         command->y = y;
1064         command->width = width;
1065         command->height = height;
1066 }
1067
1068 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1069 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1070 {
1071         thread->blendfunc[0] = command->sfactor;
1072         thread->blendfunc[1] = command->dfactor;
1073         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1074 }
1075 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1076 {
1077         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1078         command->sfactor = sfactor;
1079         command->dfactor = dfactor;
1080 }
1081
1082 DEFCOMMAND(9, BlendSubtract, int enable;)
1083 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1084 {
1085         thread->blendsubtract = command->enable;
1086         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1087 }
1088 void DPSOFTRAST_BlendSubtract(int enable)
1089 {
1090         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1091         command->enable = enable;
1092 }
1093
1094 DEFCOMMAND(10, DepthMask, int enable;)
1095 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1096 {
1097         thread->depthmask = command->enable;
1098 }
1099 void DPSOFTRAST_DepthMask(int enable)
1100 {
1101         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1102         command->enable = enable;
1103 }
1104
1105 DEFCOMMAND(11, DepthFunc, int func;)
1106 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1107 {
1108         thread->depthfunc = command->func;
1109 }
1110 void DPSOFTRAST_DepthFunc(int func)
1111 {
1112         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1113         command->func = func;
1114 }
1115
1116 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1117 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1118 {
1119         thread->depthrange[0] = command->nearval;
1120         thread->depthrange[1] = command->farval;
1121 }
1122 void DPSOFTRAST_DepthRange(float nearval, float farval)
1123 {
1124         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1125         command->nearval = nearval;
1126         command->farval = farval;
1127 }
1128
1129 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1130 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1131 {
1132         thread->polygonoffset[0] = command->alongnormal;
1133         thread->polygonoffset[1] = command->intoview;
1134 }
1135 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1136 {
1137         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1138         command->alongnormal = alongnormal;
1139         command->intoview = intoview;
1140 }
1141
1142 DEFCOMMAND(14, CullFace, int mode;)
1143 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1144 {
1145         thread->cullface = command->mode;
1146 }
1147 void DPSOFTRAST_CullFace(int mode)
1148 {
1149         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1150         command->mode = mode;
1151 }
1152
1153 DEFCOMMAND(15, AlphaTest, int enable;)
1154 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1155 {
1156         thread->alphatest = command->enable;
1157 }
1158 void DPSOFTRAST_AlphaTest(int enable)
1159 {
1160         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1161         command->enable = enable;
1162 }
1163
1164 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1165 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1166 {
1167         thread->alphafunc = command->func;
1168         thread->alphavalue = command->ref;
1169 }
1170 void DPSOFTRAST_AlphaFunc(int func, float ref)
1171 {
1172         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1173         command->func = func;
1174         command->ref = ref;
1175 }
1176
1177 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1178 {
1179         dpsoftrast.color[0] = r;
1180         dpsoftrast.color[1] = g;
1181         dpsoftrast.color[2] = b;
1182         dpsoftrast.color[3] = a;
1183 }
1184
1185 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1186 {
1187         int outstride = blockwidth * 4;
1188         int instride = dpsoftrast.fb_width * 4;
1189         int bx1 = blockx;
1190         int by1 = blocky;
1191         int bx2 = blockx + blockwidth;
1192         int by2 = blocky + blockheight;
1193         int bw;
1194         int x;
1195         int y;
1196         unsigned char *inpixels;
1197         unsigned char *b;
1198         unsigned char *o;
1199         DPSOFTRAST_Flush();
1200         if (bx1 < 0) bx1 = 0;
1201         if (by1 < 0) by1 = 0;
1202         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1203         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1204         bw = bx2 - bx1;
1205         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1206         if (dpsoftrast.bigendian)
1207         {
1208                 for (y = by1;y < by2;y++)
1209                 {
1210                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1212                         for (x = bx1;x < bx2;x++)
1213                         {
1214                                 o[0] = b[3];
1215                                 o[1] = b[2];
1216                                 o[2] = b[1];
1217                                 o[3] = b[0];
1218                                 o += 4;
1219                                 b += 4;
1220                         }
1221                 }
1222         }
1223         else
1224         {
1225                 for (y = by1;y < by2;y++)
1226                 {
1227                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1228                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1229                         memcpy(o, b, bw*4);
1230                 }
1231         }
1232
1233 }
1234 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1235 {
1236         int tx1 = tx;
1237         int ty1 = ty;
1238         int tx2 = tx + width;
1239         int ty2 = ty + height;
1240         int sx1 = sx;
1241         int sy1 = sy;
1242         int sx2 = sx + width;
1243         int sy2 = sy + height;
1244         int swidth;
1245         int sheight;
1246         int twidth;
1247         int theight;
1248         int sw;
1249         int sh;
1250         int tw;
1251         int th;
1252         int y;
1253         unsigned int *spixels;
1254         unsigned int *tpixels;
1255         DPSOFTRAST_Texture *texture;
1256         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1257         if (mip < 0 || mip >= texture->mipmaps) return;
1258         DPSOFTRAST_Flush();
1259         spixels = dpsoftrast.fb_colorpixels[0];
1260         swidth = dpsoftrast.fb_width;
1261         sheight = dpsoftrast.fb_height;
1262         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1263         twidth = texture->mipmap[mip][2];
1264         theight = texture->mipmap[mip][3];
1265         if (tx1 < 0) tx1 = 0;
1266         if (ty1 < 0) ty1 = 0;
1267         if (tx2 > twidth) tx2 = twidth;
1268         if (ty2 > theight) ty2 = theight;
1269         if (sx1 < 0) sx1 = 0;
1270         if (sy1 < 0) sy1 = 0;
1271         if (sx2 > swidth) sx2 = swidth;
1272         if (sy2 > sheight) sy2 = sheight;
1273         tw = tx2 - tx1;
1274         th = ty2 - ty1;
1275         sw = sx2 - sx1;
1276         sh = sy2 - sy1;
1277         if (tw > sw) tw = sw;
1278         if (th > sh) th = sh;
1279         if (tw < 1 || th < 1)
1280                 return;
1281         sy1 = sheight - 1 - sy1;
1282         for (y = 0;y < th;y++)
1283                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1284         if (texture->mipmaps > 1)
1285                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1286 }
1287
1288 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1289 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1290 {
1291         if (thread->texbound[command->unitnum])
1292                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1293         thread->texbound[command->unitnum] = command->texture;
1294 }
1295 void DPSOFTRAST_SetTexture(int unitnum, int index)
1296 {
1297         DPSOFTRAST_Command_SetTexture *command;
1298         DPSOFTRAST_Texture *texture;
1299         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1300         {
1301                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1302                 return;
1303         }
1304         texture = DPSOFTRAST_Texture_GetByIndex(index);
1305         if (index && !texture)
1306         {
1307                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1308                 return;
1309         }
1310
1311         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1312         command->unitnum = unitnum;
1313         command->texture = texture;
1314
1315         dpsoftrast.texbound[unitnum] = texture;
1316         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1317 }
1318
1319 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1320 {
1321         dpsoftrast.pointer_vertex3f = vertex3f;
1322         dpsoftrast.stride_vertex = stride;
1323 }
1324 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1325 {
1326         dpsoftrast.pointer_color4f = color4f;
1327         dpsoftrast.pointer_color4ub = NULL;
1328         dpsoftrast.stride_color = stride;
1329 }
1330 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1331 {
1332         dpsoftrast.pointer_color4f = NULL;
1333         dpsoftrast.pointer_color4ub = color4ub;
1334         dpsoftrast.stride_color = stride;
1335 }
1336 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1337 {
1338         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1339         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1340         dpsoftrast.stride_texcoord[unitnum] = stride;
1341 }
1342
1343 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1344 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1345 {
1346         thread->shader_mode = command->mode;
1347         thread->shader_permutation = command->permutation;
1348         thread->shader_exactspecularmath = command->exactspecularmath;
1349 }
1350 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1351 {
1352         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1353         command->mode = mode;
1354         command->permutation = permutation;
1355         command->exactspecularmath = exactspecularmath;
1356
1357         dpsoftrast.shader_mode = mode;
1358         dpsoftrast.shader_permutation = permutation;
1359         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1360 }
1361
1362 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1363 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1364 {
1365         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1366 }
1367 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1368 {
1369         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1370         command->index = index;
1371         command->val[0] = v0;
1372         command->val[1] = v1;
1373         command->val[2] = v2;
1374         command->val[3] = v3;
1375
1376         dpsoftrast.uniform4f[index*4+0] = v0;
1377         dpsoftrast.uniform4f[index*4+1] = v1;
1378         dpsoftrast.uniform4f[index*4+2] = v2;
1379         dpsoftrast.uniform4f[index*4+3] = v3;
1380 }
1381 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1382 {
1383         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1384         command->index = index;
1385         memcpy(command->val, v, sizeof(command->val));
1386
1387         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1388 }
1389
1390 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1391 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1392 {
1393         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1394 }
1395 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1396 {
1397 #ifdef SSE_POSSIBLE
1398         int i, index;
1399         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1400         {
1401                 __m128 m0, m1, m2, m3;
1402                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1403                 command->index = (DPSOFTRAST_UNIFORM)index;
1404                 if (((size_t)v)&(ALIGN_SIZE-1))
1405                 {
1406                         m0 = _mm_loadu_ps(v);
1407                         m1 = _mm_loadu_ps(v+4);
1408                         m2 = _mm_loadu_ps(v+8);
1409                         m3 = _mm_loadu_ps(v+12);
1410                 }
1411                 else
1412                 {
1413                         m0 = _mm_load_ps(v);
1414                         m1 = _mm_load_ps(v+4);
1415                         m2 = _mm_load_ps(v+8);
1416                         m3 = _mm_load_ps(v+12);
1417                 }
1418                 if (transpose)
1419                 {
1420                         __m128 t0, t1, t2, t3;
1421                         t0 = _mm_unpacklo_ps(m0, m1);
1422                         t1 = _mm_unpacklo_ps(m2, m3);
1423                         t2 = _mm_unpackhi_ps(m0, m1);
1424                         t3 = _mm_unpackhi_ps(m2, m3);
1425                         m0 = _mm_movelh_ps(t0, t1);
1426                         m1 = _mm_movehl_ps(t1, t0);
1427                         m2 = _mm_movelh_ps(t2, t3);
1428                         m3 = _mm_movehl_ps(t3, t2);                     
1429                 }
1430                 _mm_store_ps(command->val, m0);
1431                 _mm_store_ps(command->val+4, m1);
1432                 _mm_store_ps(command->val+8, m2);
1433                 _mm_store_ps(command->val+12, m3);
1434                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1435                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1436                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1437                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1438         }
1439 #endif
1440 }
1441
1442 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1443 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1444 {
1445         thread->uniform1i[command->index] = command->val;
1446 }
1447 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1448 {
1449         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1450         command->index = index;
1451         command->val = i0;
1452
1453         dpsoftrast.uniform1i[command->index] = i0;
1454 }
1455
1456 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1457 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1458 {
1459         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1460         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1461 }
1462 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1463 {
1464         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1465         command->clipplane[0] = x;
1466         command->clipplane[1] = y;
1467         command->clipplane[2] = z;
1468         command->clipplane[3] = w;
1469 }
1470
1471 #ifdef SSE_POSSIBLE
// gathers 'size' four-float elements, spaced 'stride' bytes apart starting
// at src, into the packed array dst.
// dst is written with aligned stores; src is read with aligned loads only
// when both its address and the stride are multiples of ALIGN_SIZE
static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	int i;
	if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
	{
		// unaligned source address or stride
		for (i = 0;i < size;i++, src += stride)
			_mm_store_ps(dst + i*4, _mm_loadu_ps((const float *)src));
	}
	else
	{
		// every element starts on an ALIGN_SIZE boundary
		for (i = 0;i < size;i++, src += stride)
			_mm_store_ps(dst + i*4, _mm_load_ps((const float *)src));
	}
}
1494
// expands 'size' three-float elements, spaced 'stride' bytes apart starting
// at src, into packed four-float vectors (x, y, z, 1) in dst.
// dst is written with aligned stores.
// tightly packed input (stride == 12 bytes) is converted four elements at a
// time from three 16-byte loads; all remaining elements are loaded component
// by component so that no read ever extends past the 12 bytes of an element
// (a 16-byte load there would run 4 bytes beyond the last element of the
// source array)
static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	const __m128 one = _mm_set_ss(1.0f);
	if (stride == sizeof(float[3]))
	{
		float *end4 = dst + (size&~3)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
		{
			while (dst < end4)
			{
				// v1 = (x0 y0 z0 x1), v2 = (y1 z1 x2 y2), v3 = (z2 x3 y3 z3)
				__m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv;
				// each element is assembled as (?, x, y, z), lane 0 is
				// replaced by 1.0 and the lanes are rotated to (x, y, z, 1)
				dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
				dv = _mm_move_ss(dv, one);
				_mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
				dv = _mm_move_ss(dv, one);
				_mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
				dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
				dv = _mm_move_ss(dv, one);
				_mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_move_ss(v3, one);
				_mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dst += 16;
				src += 4*sizeof(float[3]);
			}
		}
		else
		{
			while (dst < end4)
			{
				// same conversion as above, using aligned loads
				__m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
				dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
				dv = _mm_move_ss(dv, one);
				_mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
				dv = _mm_move_ss(dv, one);
				_mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
				dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
				dv = _mm_move_ss(dv, one);
				_mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dv = _mm_move_ss(v3, one);
				_mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
				dst += 16;
				src += 4*sizeof(float[3]);
			}
		}
	}
	// leftover packed elements and every element of a strided array
	while (dst < end)
	{
		const float *f = (const float *)src;
		// xy = (x, y, 0, 0) from an 8-byte load, z1 = (z, 1, 0, 0) from a
		// 4-byte load; neither load has an alignment requirement
		__m128 xy = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)f);
		__m128 z1 = _mm_unpacklo_ps(_mm_load_ss(f + 2), one);
		_mm_store_ps(dst, _mm_movelh_ps(xy, z1));
		dst += 4;
		src += stride;
	}
}
1571
// expands 'size' two-float elements, spaced 'stride' bytes apart starting
// at src, into packed four-float vectors (x, y, 0, 1) in dst.
// dst is written with aligned stores.
// tightly packed input (stride == 8 bytes) is converted two elements per
// 16-byte load; whatever remains is loaded 8 bytes at a time
static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *const end = dst + size*4;
	// supplies the constant upper half (0, 1) of every output vector
	const __m128 zw = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
	if (stride == sizeof(float[2]))
	{
		float *const pairend = dst + (size&~1)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
		{
			for (;dst < pairend;dst += 8, src += 2*sizeof(float[2]))
			{
				// pair = (x0 y0 x1 y1)
				const __m128 pair = _mm_loadu_ps((const float *)src);
				_mm_store_ps(dst, _mm_shuffle_ps(pair, zw, _MM_SHUFFLE(3, 2, 1, 0)));
				_mm_store_ps(dst + 4, _mm_movehl_ps(zw, pair));
			}
		}
		else
		{
			for (;dst < pairend;dst += 8, src += 2*sizeof(float[2]))
			{
				const __m128 pair = _mm_load_ps((const float *)src);
				_mm_store_ps(dst, _mm_shuffle_ps(pair, zw, _MM_SHUFFLE(3, 2, 1, 0)));
				_mm_store_ps(dst + 4, _mm_movehl_ps(zw, pair));
			}
		}
	}
	// odd leftover element, or every element of a strided array
	for (;dst < end;dst += 4, src += stride)
		_mm_store_ps(dst, _mm_loadl_pi(zw, (__m64 *)src));
}
1609
// converts 'size' four-byte elements, spaced 'stride' bytes apart starting
// at src, into packed four-float vectors in dst; every byte b becomes
// b * (1/255).
// dst is written with aligned stores.
// tightly packed input (stride == 4 bytes) is converted four elements per
// 16-byte load; whatever remains is converted one element at a time
static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	__m128 scale = _mm_set1_ps(1.0f/255.0f);
	if (stride == sizeof(unsigned char[4]))
	{
		float *end4 = dst + (size&~3)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
		{
			while (dst < end4)
			{
				// widen 16 bytes to 16-bit (v1, v2) and then to 32-bit
				// integers by interleaving with zero
				__m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
				_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
				dst += 16;
				src += 4*sizeof(unsigned char[4]);
			}
		}
		else
		{
			while (dst < end4)
			{
				__m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
				_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
				_mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
				dst += 16;
				src += 4*sizeof(unsigned char[4]);
			}
		}
	}
	while (dst < end)
	{
		int packed;
		__m128i v;
		// src is a byte pointer with an arbitrary stride, so fetch the four
		// bytes with memcpy instead of dereferencing it as an int pointer
		memcpy(&packed, src, sizeof(packed));
		v = _mm_cvtsi32_si128(packed);
		_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
		dst += 4;
		src += stride;
	}
}
1652
// writes 'size' copies of the four floats at src into dst.
// src is read once with an unaligned load; dst is written with aligned stores
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0;i < size;i++)
		_mm_store_ps(dst + i*4, value);
}
1663 #endif
1664
// transforms numitems four-float vectors from in4f by the 16-float matrix
// inmatrix16f and writes the results to out4f (out4f may equal in4f):
//   out = in[0]*M[0..3] + in[1]*M[4..7] + in[2]*M[8..11] + in[3]*M[12..15]
// neither array has to be 16-byte aligned; aligned loads and stores are used
// only when both arrays are aligned.
// a matrix that is bitwise equal to the identity turns this into a plain
// copy, and a non-positive numitems does nothing.
// when SSE_POSSIBLE is not defined the function body is empty
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 m0, m1, m2, m3;
	float *end;
	// a negative count must never reach memcpy as a huge unsigned size
	if (numitems <= 0)
		return;
	if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
	{
		// fast case for identity matrix
		if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	end = out4f + numitems*4;
	m0 = _mm_loadu_ps(inmatrix16f);
	m1 = _mm_loadu_ps(inmatrix16f + 4);
	m2 = _mm_loadu_ps(inmatrix16f + 8);
	m3 = _mm_loadu_ps(inmatrix16f + 12);
	if ((((size_t)in4f)|((size_t)out4f))&(ALIGN_SIZE-1)) // check alignment of both arrays
	{
		// at least one array is unaligned: an aligned store to out4f would
		// fault here, so both accesses use the unaligned forms
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f);
			_mm_storeu_ps(out4f,
				_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
						_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
							_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
										_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
			out4f += 4;
			in4f += 4;
		}
	}
	else
	{
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f);
			_mm_store_ps(out4f,
				_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
						_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
							_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
										_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
			out4f += 4;
			in4f += 4;
		}
	}
#endif
}
1712
// copies numitems four-float vectors from in4f to out4f.
// the two arrays may be the same array or overlap (memmove is used, since
// memcpy is undefined for overlapping objects), and a non-positive numitems
// copies nothing instead of turning into a huge unsigned byte count
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0 || out4f == in4f)
		return;
	memmove(out4f, in4f, numitems * sizeof(float[4]));
}
1717
1718 #ifdef SSE_POSSIBLE
// DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale)
// divides the __m128 vector 'in' = (x, y, z, w) by its w component, applies
// the viewport scale and offset, and assigns the result to 'out'.
// internally the lanes are rotated to (1, x, y, z), so lane 0 of
// viewportcenter/viewportscale pairs with 1/w and lanes 1..3 pair with x/w,
// y/w and z/w; the result is rotated back, giving
//   out = (vc[1] + vs[1]*x/w, vc[2] + vs[2]*y/w, vc[3] + vs[3]*z/w, vc[0] + vs[0]/w)
// the expansion is a bare { } block that declares locals named p and w, so
// none of the arguments may be an expression that mentions those names
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
	p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1726
// DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale)
// NOTE(review): despite the name, the body is token-for-token the same as
// DPSOFTRAST_PROJECTVERTEX - it projects all four lanes, not only y:
//   out = (vc[1] + vs[1]*x/w, vc[2] + vs[2]*y/w, vc[3] + vs[3]*z/w, vc[0] + vs[0]/w)
// the expansion is a bare { } block that declares locals named p and w, so
// none of the arguments may be an expression that mentions those names
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
	p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1734
// DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3)
// assigns to 'out' the linear combination of the four __m128 vectors m0..m3
// weighted by the lanes of 'in':
//   out = in[0]*m0 + (in[1]*m1 + (in[2]*m2 + in[3]*m3))
// 'in' is evaluated once into a local named p, so 'out' and 'in' may be the
// same variable; the expansion is a bare { } block, and none of the
// arguments may be an expression that mentions the name p
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
						  _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
								_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
											_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1743
// computes a screen-space y range for the axis-aligned box spanned by
// minposf and maxposf after transformation by inmatrix16f.
// minposf and maxposf are read with aligned loads and so must be 16-byte
// aligned; inmatrix16f may be unaligned.
// writes the range to *starty and *endy and returns a bitmask in which bit k
// is still set for every box corner k that failed the z + w >= 0 test
// (0 means no corner failed it, 0xFF means all eight did).
// corner k takes its x from maxpos when bit 0 of k is set, its y when bit 1
// is set and its z (and w) when bit 2 is set, otherwise from minpos
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
{
	int clipmask = 0xFF;
	__m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	// minproj/maxproj track the smallest and largest y/w seen so far in lane 0;
	// they start out inverted (2 and -2) so any candidate replaces them
	__m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
	__m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
	__m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
	// swap elements 0 and 1 of every matrix vector so that the transformed
	// corners carry their y component in lane 0, where the scalar (_ss)
	// operations below act
	m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
	m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
	m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
	m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
	// BBFRONT(k, pos): transform corner k into bb[k] = (y, x, z, w), store
	// z + w in lane 0 of clipdist[k] (presumably the near clip plane
	// distance), and if that is >= 0 clear bit k of clipmask and fold y/w
	// into minproj/maxproj
	#define BBFRONT(k, pos) \
	{ \
		DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
		clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
		if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
		{ \
			__m128 proj; \
			clipmask &= ~(1<<k); \
			proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
			minproj = _mm_min_ss(minproj, proj); \
			maxproj = _mm_max_ss(maxproj, proj); \
		} \
	}
	// the eight corners, built by mixing lanes of minpos and maxpos
	BBFRONT(0, minpos);
	BBFRONT(1, _mm_move_ss(minpos, maxpos));
	BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(6, _mm_move_ss(maxpos, minpos));
	BBFRONT(7, maxpos);
	// BBCLIP(k): if corner k failed the test, then for each of the three
	// corners sharing a box edge with it (k^1, k^2, k^4) that passed,
	// interpolate along the edge to the point where clipdist reaches zero and
	// fold that point's y/w into minproj/maxproj
	#define BBCLIP(k) \
	{ \
		if (clipmask&(1<<k)) \
		{ \
			if (!(clipmask&(1<<(k^1)))) \
			{ \
				__m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
				__m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
				proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
				minproj = _mm_min_ss(minproj, proj); \
				maxproj = _mm_max_ss(maxproj, proj); \
			} \
			if (!(clipmask&(1<<(k^2)))) \
			{ \
				__m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
				__m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
				proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
				minproj = _mm_min_ss(minproj, proj); \
				maxproj = _mm_max_ss(maxproj, proj); \
			} \
			if (!(clipmask&(1<<(k^4)))) \
			{ \
				__m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
				__m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
				proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
				minproj = _mm_min_ss(minproj, proj); \
				maxproj = _mm_max_ss(maxproj, proj); \
			} \
		} \
	}
	BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
	// NOTE(review): BBFRONT and BBCLIP are not #undef'd, so they stay defined
	// for the rest of the translation unit
	// move element 2 of the viewport vectors into lane 0; going by how
	// DPSOFTRAST_PROJECTVERTEX pairs the lanes, that is the y entry
	viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
	viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
	// keep minproj at or above -2 and maxproj at or below 2
	minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
	maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
	minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
	maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
	// starty comes from the largest y/w and endy from the smallest, both
	// truncated toward zero - presumably because the y viewport scale is
	// negative (TODO confirm against where fb_viewportscale is set up)
	*starty = _mm_cvttss_si32(maxproj);
	*endy = _mm_cvttss_si32(minproj)+1;
	return clipmask;
}
1817         
1818 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1819 {
1820         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1821         float *end = out4f + numitems*4;
1822         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1823         __m128 minpos, maxpos;
1824         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1825         {
1826                 minpos = maxpos = _mm_loadu_ps(in4f);
1827                 while (out4f < end)
1828                 {
1829                         __m128 v = _mm_loadu_ps(in4f);
1830                         minpos = _mm_min_ps(minpos, v);
1831                         maxpos = _mm_max_ps(maxpos, v);
1832                         _mm_store_ps(out4f, v);
1833                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1834                         _mm_store_ps(screen4f, v);
1835                         in4f += 4;
1836                         out4f += 4;
1837                         screen4f += 4;
1838                 }
1839         }
1840         else
1841         {
1842                 minpos = maxpos = _mm_load_ps(in4f);
1843                 while (out4f < end)
1844                 {
1845                         __m128 v = _mm_load_ps(in4f);
1846                         minpos = _mm_min_ps(minpos, v);
1847                         maxpos = _mm_max_ps(maxpos, v);
1848                         _mm_store_ps(out4f, v);
1849                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1850                         _mm_store_ps(screen4f, v);
1851                         in4f += 4;
1852                         out4f += 4;
1853                         screen4f += 4;
1854                 }
1855         }
1856         if (starty && endy) 
1857         {
1858                 ALIGN(float minposf[4]);
1859                 ALIGN(float maxposf[4]);
1860                 _mm_store_ps(minposf, minpos);
1861                 _mm_store_ps(maxposf, maxpos);
1862                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1863         }
1864         return 0;
1865 }
1866
1867 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1868 {
1869         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1870         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1871         float *end;
1872         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1873                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1874         end = out4f + numitems*4;
1875         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1876         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1877         m0 = _mm_loadu_ps(inmatrix16f);
1878         m1 = _mm_loadu_ps(inmatrix16f + 4);
1879         m2 = _mm_loadu_ps(inmatrix16f + 8);
1880         m3 = _mm_loadu_ps(inmatrix16f + 12);
1881         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1882         {
1883                 minpos = maxpos = _mm_loadu_ps(in4f);
1884                 while (out4f < end)
1885                 {
1886                         __m128 v = _mm_loadu_ps(in4f);
1887                         minpos = _mm_min_ps(minpos, v);
1888                         maxpos = _mm_max_ps(maxpos, v);
1889                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1890                         _mm_store_ps(out4f, v);
1891                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1892                         _mm_store_ps(screen4f, v);
1893                         in4f += 4;
1894                         out4f += 4;
1895                         screen4f += 4;
1896                 }
1897         }
1898         else
1899         {
1900                 minpos = maxpos = _mm_load_ps(in4f);
1901                 while (out4f < end)
1902                 {
1903                         __m128 v = _mm_load_ps(in4f);
1904                         minpos = _mm_min_ps(minpos, v);
1905                         maxpos = _mm_max_ps(maxpos, v);
1906                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1907                         _mm_store_ps(out4f, v);
1908                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1909                         _mm_store_ps(screen4f, v);
1910                         in4f += 4;
1911                         out4f += 4;
1912                         screen4f += 4;
1913                 }
1914         }
1915         if (starty && endy) 
1916         {
1917                 ALIGN(float minposf[4]);
1918                 ALIGN(float maxposf[4]);
1919                 _mm_store_ps(minposf, minpos);
1920                 _mm_store_ps(maxposf, maxpos);
1921                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1922         }
1923         return 0;
1924 }
1925 #endif
1926
// expands the vertex array selected by inarray into four-float vectors in
// dpsoftrast.post_array4f[outarray] and returns that buffer.
// dpsoftrast.numvertices elements are converted, starting at element
// dpsoftrast.firstvertex of the source pointer:
//  - DPSOFTRAST_ARRAY_POSITION reads pointer_vertex3f as three floats
//  - DPSOFTRAST_ARRAY_COLOR reads pointer_color4f if set, otherwise
//    pointer_color4ub if set, otherwise fills with dpsoftrast.color
//  - any other value is treated as a texcoord array (index relative to
//    DPSOFTRAST_ARRAY_TEXCOORD0) of 2, 3 or 4 float components; the buffer
//    is left unwritten if the pointer is NULL or the component count is
//    anything else
// returns NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const unsigned char *inb;
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
		{
			// no color array bound: replicate the constant color
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
		}
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			const int components = dpsoftrast.components_texcoord[unit];
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			if (components == 2)
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
			else if (components == 3)
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
			else if (components == 4)
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1985
1986 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1987 {
1988         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1989         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1990         return data;
1991 }
1992
#if 0
// compiled out: like DPSOFTRAST_Array_TransformProject but without a matrix,
// calling DPSOFTRAST_Vertex_Project on the buffer in place
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
2005
// transforms dpsoftrast.numvertices vectors of post_array4f[outarray] in
// place by inmatrix16f, writes their projections to dpsoftrast.screencoord4f
// and returns the buffer.
// when inarray >= 0 the buffer is first filled by DPSOFTRAST_Array_Load.
// the result of DPSOFTRAST_Vertex_TransformProject is stored in
// dpsoftrast.drawclipped, and dpsoftrast.drawstarty / drawendy receive the
// y range it computes.
// returns NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2016
2017 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2018 {
2019         int x;
2020         int startx = span->startx;
2021         int endx = span->endx;
2022         float wslope = triangle->w[0];
2023         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2024         float endz = 1.0f / (w + wslope * startx);
2025         if (triangle->w[0] == 0)
2026         {
2027                 // LordHavoc: fast flat polygons (HUD/menu)
2028                 for (x = startx;x < endx;x++)
2029                         zf[x] = endz;
2030                 return;
2031         }
2032         for (x = startx;x < endx;)
2033         {
2034                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2035                 float z = endz, dz;
2036                 if (nextsub >= endx) nextsub = endsub = endx-1;
2037                 endz = 1.0f / (w + wslope * nextsub);
2038                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2039                 for (; x <= endsub; x++, z += dz)
2040                         zf[x] = z;
2041         }
2042 }
2043
2044 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2045 {
2046 #ifdef SSE_POSSIBLE
2047         int x;
2048         int startx = span->startx;
2049         int endx = span->endx;
2050         int subx;
2051         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2052         unsigned char * RESTRICT pixelmask = span->pixelmask;
2053         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2054         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2055         if (!pixel)
2056                 return;
2057         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2058         pixeli += span->y * dpsoftrast.fb_width + span->x;
2059         // handle alphatest now (this affects depth writes too)
2060         if (thread->alphatest)
2061                 for (x = startx;x < endx;x++)
2062                         if (in4ub[x*4+3] < 128)
2063                                 pixelmask[x] = false;
2064         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2065         // helps sprites, text and hud artwork
2066         switch(thread->fb_blendmode)
2067         {
2068         case DPSOFTRAST_BLENDMODE_ALPHA:
2069         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2070         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2071                 for (x = startx;x < endx;x++)
2072                         if (in4ub[x*4+3] < 1)
2073                                 pixelmask[x] = false;
2074                 break;
2075         case DPSOFTRAST_BLENDMODE_OPAQUE:
2076         case DPSOFTRAST_BLENDMODE_ADD:
2077         case DPSOFTRAST_BLENDMODE_INVMOD:
2078         case DPSOFTRAST_BLENDMODE_MUL:
2079         case DPSOFTRAST_BLENDMODE_MUL2:
2080         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2081         case DPSOFTRAST_BLENDMODE_INVADD:
2082                 break;
2083         }
2084         // put some special values at the end of the mask to ensure the loops end
2085         pixelmask[endx] = 1;
2086         pixelmask[endx+1] = 0;
2087         // LordHavoc: use a double loop to identify subspans, this helps the
2088         // optimized copy/blend loops to perform at their best, most triangles
2089         // have only one run of pixels, and do the search using wide reads...
2090         x = startx;
2091         while (x < endx)
2092         {
2093                 // if this pixel is masked off, it's probably not alone...
2094                 if (!pixelmask[x])
2095                 {
2096                         x++;
2097 #if 1
2098                         if (x + 8 < endx)
2099                         {
2100                                 // the 4-item search must be aligned or else it stalls badly
2101                                 if ((x & 3) && !pixelmask[x]) 
2102                                 {
2103                                         if(pixelmask[x]) goto endmasked;
2104                                         x++;
2105                                         if (x & 3)
2106                                         {
2107                                                 if(pixelmask[x]) goto endmasked;
2108                                                 x++;
2109                                                 if (x & 3)
2110                                                 {
2111                                                         if(pixelmask[x]) goto endmasked;
2112                                                         x++;
2113                                                 }
2114                                         }
2115                                 }
2116                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2117                                         x += 4;
2118                         }
2119 #endif
2120                         for (;!pixelmask[x];x++)
2121                                 ;
2122                         // rather than continue the loop, just check the end variable
2123                         if (x >= endx)
2124                                 break;
2125                 }
2126         endmasked:
2127                 // find length of subspan
2128                 subx = x + 1;
2129 #if 1
2130                 if (subx + 8 < endx)
2131                 {
2132                         if (subx & 3)
2133                         {
2134                                 if(!pixelmask[subx]) goto endunmasked;
2135                                 subx++;
2136                                 if (subx & 3)
2137                                 {
2138                                         if(!pixelmask[subx]) goto endunmasked;
2139                                         subx++;
2140                                         if (subx & 3)
2141                                         {
2142                                                 if(!pixelmask[subx]) goto endunmasked;
2143                                                 subx++;
2144                                         }
2145                                 }
2146                         }
2147                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2148                                 subx += 4;
2149                 }
2150 #endif
2151                 for (;pixelmask[subx];subx++)
2152                         ;
2153                 // the checks can overshoot, so make sure to clip it...
2154                 if (subx > endx)
2155                         subx = endx;
2156         endunmasked:
2157                 // now that we know the subspan length...  process!
2158                 switch(thread->fb_blendmode)
2159                 {
2160                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2161 #if 0
2162                         if (subx - x >= 16)
2163                         {
2164                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2165                                 x = subx;
2166                         }
2167                         else
2168 #elif 1
2169                         while (x + 16 <= subx)
2170                         {
2171                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2172                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2173                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2174                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2175                                 x += 16;
2176                         }
2177 #endif
2178                         {
2179                                 while (x + 4 <= subx)
2180                                 {
2181                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2182                                         x += 4;
2183                                 }
2184                                 if (x + 2 <= subx)
2185                                 {
2186                                         pixeli[x] = ini[x];
2187                                         pixeli[x+1] = ini[x+1];
2188                                         x += 2;
2189                                 }
2190                                 if (x < subx)
2191                                 {
2192                                         pixeli[x] = ini[x];
2193                                         x++;
2194                                 }
2195                         }
2196                         break;
2197                 case DPSOFTRAST_BLENDMODE_ALPHA:
2198                 #define FINISHBLEND(blend2, blend1) \
2199                         for (;x + 1 < subx;x += 2) \
2200                         { \
2201                                 __m128i src, dst; \
2202                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2203                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2204                                 blend2; \
2205                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2206                         } \
2207                         if (x < subx) \
2208                         { \
2209                                 __m128i src, dst; \
2210                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2211                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2212                                 blend1; \
2213                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2214                                 x++; \
2215                         }
2216                         FINISHBLEND({
2217                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2218                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2219                         }, {
2220                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2221                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2222                         });
2223                         break;
2224                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2225                         FINISHBLEND({
2226                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2227                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2228                         }, {
2229                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2230                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2231                         });
2232                         break;
2233                 case DPSOFTRAST_BLENDMODE_ADD:
2234                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2235                         break;
2236                 case DPSOFTRAST_BLENDMODE_INVMOD:
2237                         FINISHBLEND({
2238                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2239                         }, {
2240                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2241                         });
2242                         break;
2243                 case DPSOFTRAST_BLENDMODE_MUL:
2244                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2245                         break;
2246                 case DPSOFTRAST_BLENDMODE_MUL2:
2247                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2248                         break;
2249                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2250                         FINISHBLEND({
2251                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2252                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2253                         }, {
2254                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2255                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2256                         });
2257                         break;
2258                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2259                         FINISHBLEND({
2260                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2261                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2262                         }, {
2263                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2264                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2265                         });
2266                         break;
2267                 case DPSOFTRAST_BLENDMODE_INVADD:
2268                         FINISHBLEND({
2269                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2270                         }, {
2271                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2272                         });
2273                         break;
2274                 }
2275         }
2276 #endif
2277 }
2278
2279 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2280 {
2281         int x;
2282         int startx = span->startx;
2283         int endx = span->endx;
2284         int flags;
2285         float c[4];
2286         float data[4];
2287         float slope[4];
2288         float tc[2], endtc[2];
2289         float tcscale[2];
2290         unsigned int tci[2];
2291         unsigned int tci1[2];
2292         unsigned int tcimin[2];
2293         unsigned int tcimax[2];
2294         int tciwrapmask[2];
2295         int tciwidth;
2296         int filter;
2297         int mip;
2298         const unsigned char * RESTRICT pixelbase;
2299         const unsigned char * RESTRICT pixel[4];
2300         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2301         // if no texture is bound, just fill it with white
2302         if (!texture)
2303         {
2304                 for (x = startx;x < endx;x++)
2305                 {
2306                         out4f[x*4+0] = 1.0f;
2307                         out4f[x*4+1] = 1.0f;
2308                         out4f[x*4+2] = 1.0f;
2309                         out4f[x*4+3] = 1.0f;
2310                 }
2311                 return;
2312         }
2313         mip = triangle->mip[texunitindex];
2314         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2315         // if this mipmap of the texture is 1 pixel, just fill it with that color
2316         if (texture->mipmap[mip][1] == 4)
2317         {
2318                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2319                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2320                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2321                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2322                 for (x = startx;x < endx;x++)
2323                 {
2324                         out4f[x*4+0] = c[0];
2325                         out4f[x*4+1] = c[1];
2326                         out4f[x*4+2] = c[2];
2327                         out4f[x*4+3] = c[3];
2328                 }
2329                 return;
2330         }
2331         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2332         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2333         flags = texture->flags;
2334         tcscale[0] = texture->mipmap[mip][2];
2335         tcscale[1] = texture->mipmap[mip][3];
2336         tciwidth = texture->mipmap[mip][2];
2337         tcimin[0] = 0;
2338         tcimin[1] = 0;
2339         tcimax[0] = texture->mipmap[mip][2]-1;
2340         tcimax[1] = texture->mipmap[mip][3]-1;
2341         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2342         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2343         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2344         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2345         if (filter)
2346         {
2347                 endtc[0] -= 0.5f;
2348                 endtc[1] -= 0.5f;
2349         }
2350         for (x = startx;x < endx;)
2351         {
2352                 unsigned int subtc[2];
2353                 unsigned int substep[2];
2354                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2355                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2356                 if (nextsub >= endx)
2357                 {
2358                         nextsub = endsub = endx-1;      
2359                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2360                 }
2361                 tc[0] = endtc[0];
2362                 tc[1] = endtc[1];
2363                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2364                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2365                 if (filter)
2366                 {
2367                         endtc[0] -= 0.5f;
2368                         endtc[1] -= 0.5f;
2369                 }
2370                 substep[0] = (endtc[0] - tc[0]) * subscale;
2371                 substep[1] = (endtc[1] - tc[1]) * subscale;
2372                 subtc[0] = tc[0] * (1<<12);
2373                 subtc[1] = tc[1] * (1<<12);
2374                 if (filter)
2375                 {
2376                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2377                         {
2378                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2379                                 {
2380                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2381                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2382                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2383                                         tci[0] = subtc[0]>>12;
2384                                         tci[1] = subtc[1]>>12;
2385                                         tci1[0] = tci[0] + 1;
2386                                         tci1[1] = tci[1] + 1;
2387                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2388                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2389                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2390                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2391                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2392                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2393                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2394                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2395                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2396                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2397                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2398                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2399                                         out4f[x*4+0] = c[0];
2400                                         out4f[x*4+1] = c[1];
2401                                         out4f[x*4+2] = c[2];
2402                                         out4f[x*4+3] = c[3];
2403                                 }
2404                         }
2405                         else
2406                         {
2407                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2408                                 {
2409                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2410                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2411                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2412                                         tci[0] = subtc[0]>>12;
2413                                         tci[1] = subtc[1]>>12;
2414                                         tci1[0] = tci[0] + 1;
2415                                         tci1[1] = tci[1] + 1;
2416                                         tci[0] &= tciwrapmask[0];
2417                                         tci[1] &= tciwrapmask[1];
2418                                         tci1[0] &= tciwrapmask[0];
2419                                         tci1[1] &= tciwrapmask[1];
2420                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2421                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2422                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2423                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2424                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2425                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2426                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2427                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2428                                         out4f[x*4+0] = c[0];
2429                                         out4f[x*4+1] = c[1];
2430                                         out4f[x*4+2] = c[2];
2431                                         out4f[x*4+3] = c[3];
2432                                 }
2433                         }
2434                 }
2435                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2436                 {
2437                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2438                         {
2439                                 tci[0] = subtc[0]>>12;
2440                                 tci[1] = subtc[1]>>12;
2441                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2442                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2443                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2444                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2445                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2446                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2447                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2448                                 out4f[x*4+0] = c[0];
2449                                 out4f[x*4+1] = c[1];
2450                                 out4f[x*4+2] = c[2];
2451                                 out4f[x*4+3] = c[3];
2452                         }
2453                 }
2454                 else
2455                 {
2456                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2457                         {
2458                                 tci[0] = subtc[0]>>12;
2459                                 tci[1] = subtc[1]>>12;
2460                                 tci[0] &= tciwrapmask[0];
2461                                 tci[1] &= tciwrapmask[1];
2462                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2463                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2464                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2465                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2466                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2467                                 out4f[x*4+0] = c[0];
2468                                 out4f[x*4+1] = c[1];
2469                                 out4f[x*4+2] = c[2];
2470                                 out4f[x*4+3] = c[3];
2471                         }
2472                 }
2473         }
2474 }
2475
2476 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2477 {
2478 #ifdef SSE_POSSIBLE
2479         int x;
2480         int startx = span->startx;
2481         int endx = span->endx;
2482         int flags;
2483         __m128 data, slope, tcscale;
2484         __m128i tcsize, tcmask, tcoffset, tcmax;
2485         __m128 tc, endtc;
2486         __m128i subtc, substep, endsubtc;
2487         int filter;
2488         int mip;
2489         int affine; // LordHavoc: optimized affine texturing case
2490         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2491         const unsigned char * RESTRICT pixelbase;
2492         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2493         // if no texture is bound, just fill it with white
2494         if (!texture)
2495         {
2496                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2497                 return;
2498         }
2499         mip = triangle->mip[texunitindex];
2500         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2501         // if this mipmap of the texture is 1 pixel, just fill it with that color
2502         if (texture->mipmap[mip][1] == 4)
2503         {
2504                 unsigned int k = *((const unsigned int *)pixelbase);
2505                 for (x = startx;x < endx;x++)
2506                         outi[x] = k;
2507                 return;
2508         }
2509         affine = zf[startx] == zf[endx-1];
2510         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2511         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2512         flags = texture->flags;
2513         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2514         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2515         tcscale = _mm_cvtepi32_ps(tcsize);
2516         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2517         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2518         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2519         if (filter)
2520                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2521         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2522         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2523         tcmax = _mm_packs_epi32(tcmask, tcmask);
2524         for (x = startx;x < endx;)
2525         {
2526                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2527                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2528                 if (nextsub >= endx || affine)
2529                 {
2530                         nextsub = endsub = endx-1;
2531                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2532                 }       
2533                 tc = endtc;
2534                 subtc = endsubtc;
2535                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2536                 if (filter)
2537                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2538                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2539                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2540                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2541                 substep = _mm_slli_epi32(substep, 1);
2542                 if (filter)
2543                 {
2544                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2545                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2546                         {
2547                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2548                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2549                                 {
2550                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2551                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2552                                         tci = _mm_madd_epi16(tci, tcoffset);
2553                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2554                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2555                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2556                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2557                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2558                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2559                                         fracm = _mm_srli_epi16(subtc, 1);
2560                                         pix1 = _mm_add_epi16(pix1,
2561                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2562                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2563                                         pix3 = _mm_add_epi16(pix3,
2564                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2565                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2566                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2567                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2568                                         pix2 = _mm_add_epi16(pix2,
2569                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2570                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2571                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2572                                 }
2573                                 if (x <= endsub)
2574                                 {
2575                                         const unsigned char * RESTRICT ptr1;
2576                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2577                                         tci = _mm_madd_epi16(tci, tcoffset);
2578                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2579                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2580                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2581                                         fracm = _mm_srli_epi16(subtc, 1);
2582                                         pix1 = _mm_add_epi16(pix1,
2583                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2584                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2585                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2586                                         pix1 = _mm_add_epi16(pix1,
2587                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2588                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2589                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2590                                         x++;
2591                                 }
2592                         }
2593                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2594                         {
2595                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2596                                 {
2597                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2598                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2599                                         tci = _mm_madd_epi16(tci, tcoffset);
2600                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2601                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2602                                                                                         _mm_setzero_si128());
2603                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2604                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2605                                                                                         _mm_setzero_si128());
2606                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2607                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2608                                         tci = _mm_madd_epi16(tci, tcoffset);
2609                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2610                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2611                                                                                         _mm_setzero_si128());
2612                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2613                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2614                                                                                         _mm_setzero_si128());
2615                                         fracm = _mm_srli_epi16(subtc, 1);
2616                                         pix1 = _mm_add_epi16(pix1,
2617                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2618                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2619                                         pix3 = _mm_add_epi16(pix3,
2620                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2621                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2622                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2623                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2624                                         pix2 = _mm_add_epi16(pix2,
2625                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2626                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2627                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2628                                 }
2629                                 if (x <= endsub)
2630                                 {
2631                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2632                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2633                                         tci = _mm_madd_epi16(tci, tcoffset);
2634                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2635                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2636                                                                                         _mm_setzero_si128());
2637                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2638                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2639                                                                                         _mm_setzero_si128());
2640                                         fracm = _mm_srli_epi16(subtc, 1);
2641                                         pix1 = _mm_add_epi16(pix1,
2642                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2643                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2644                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2645                                         pix1 = _mm_add_epi16(pix1,
2646                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2647                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2648                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2649                                         x++;
2650                                 }
2651                         }
2652                         else
2653                         {
2654                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2655                                 {
2656                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2657                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2658                                         tci = _mm_madd_epi16(tci, tcoffset);
2659                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2660                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2661                                                                                         _mm_setzero_si128());
2662                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2663                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2664                                                                                         _mm_setzero_si128());
2665                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2666                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2667                                         tci = _mm_madd_epi16(tci, tcoffset);
2668                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2669                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2670                                                                                         _mm_setzero_si128());
2671                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2672                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2673                                                                                         _mm_setzero_si128());
2674                                         fracm = _mm_srli_epi16(subtc, 1);
2675                                         pix1 = _mm_add_epi16(pix1,
2676                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2677                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2678                                         pix3 = _mm_add_epi16(pix3,
2679                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2680                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2681                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2682                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2683                                         pix2 = _mm_add_epi16(pix2,
2684                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2685                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2686                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2687                                 }
2688                                 if (x <= endsub)
2689                                 {
2690                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2691                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2692                                         tci = _mm_madd_epi16(tci, tcoffset);
2693                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2694                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2695                                                                                         _mm_setzero_si128());
2696                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2697                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2698                                                                                         _mm_setzero_si128());
2699                                         fracm = _mm_srli_epi16(subtc, 1);
2700                                         pix1 = _mm_add_epi16(pix1,
2701                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2702                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2703                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2704                                         pix1 = _mm_add_epi16(pix1,
2705                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2706                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2707                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2708                                         x++;
2709                                 }
2710                         }
2711                 }
2712                 else
2713                 {
2714                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2715                         {
2716                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2717                                 {
2718                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2719                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2720                                         tci = _mm_madd_epi16(tci, tcoffset);
2721                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2722                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2723                                 }
2724                                 if (x <= endsub)
2725                                 {
2726                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2727                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2728                                         tci = _mm_madd_epi16(tci, tcoffset);
2729                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2730                                         x++;
2731                                 }
2732                         }
2733                         else
2734                         {
2735                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2736                                 {
2737                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2738                                         tci = _mm_and_si128(tci, tcmax); 
2739                                         tci = _mm_madd_epi16(tci, tcoffset);
2740                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2741                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2742                                 }
2743                                 if (x <= endsub)
2744                                 {
2745                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2746                                         tci = _mm_and_si128(tci, tcmax); 
2747                                         tci = _mm_madd_epi16(tci, tcoffset);
2748                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2749                                         x++;
2750                                 }
2751                         }
2752                 }
2753         }
2754 #endif
2755 }
2756
2757 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2758 {
2759         // TODO: IMPLEMENT
2760         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2761 }
2762
// Shadowmap lookup stub: the input vector is ignored and every sample
// reports 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float result = 1.0f;
        (void)vector;
        return result;
}
2768
2769 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2770 {
2771         int x;
2772         int startx = span->startx;
2773         int endx = span->endx;
2774         float c[4];
2775         float data[4];
2776         float slope[4];
2777         float z;
2778         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2779         for (x = startx;x < endx;x++)
2780         {
2781                 z = zf[x];
2782                 c[0] = (data[0] + slope[0]*x) * z;
2783                 c[1] = (data[1] + slope[1]*x) * z;
2784                 c[2] = (data[2] + slope[2]*x) * z;
2785                 c[3] = (data[3] + slope[3]*x) * z;
2786                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2787                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2788                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2789                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2790         }
2791 }
2792
2793 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2794 {
2795         int x;
2796         int startx = span->startx;
2797         int endx = span->endx;
2798         float c[4];
2799         float data[4];
2800         float slope[4];
2801         float z;
2802         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2803         for (x = startx;x < endx;x++)
2804         {
2805                 z = zf[x];
2806                 c[0] = (data[0] + slope[0]*x) * z;
2807                 c[1] = (data[1] + slope[1]*x) * z;
2808                 c[2] = (data[2] + slope[2]*x) * z;
2809                 c[3] = (data[3] + slope[3]*x) * z;
2810                 out4f[x*4+0] = c[0];
2811                 out4f[x*4+1] = c[1];
2812                 out4f[x*4+2] = c[2];
2813                 out4f[x*4+3] = c[3];
2814         }
2815 }
2816
2817 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2818 {
2819         int x, startx = span->startx, endx = span->endx;
2820         float c[4], localcolor[4];
2821         localcolor[0] = subcolor[0];
2822         localcolor[1] = subcolor[1];
2823         localcolor[2] = subcolor[2];
2824         localcolor[3] = subcolor[3];
2825         for (x = startx;x < endx;x++)
2826         {
2827                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2828                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2829                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2830                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2831                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2832                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2833                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2834                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2835         }
2836 }
2837
2838 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2839 {
2840         int x, startx = span->startx, endx = span->endx;
2841         for (x = startx;x < endx;x++)
2842         {
2843                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2844                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2845                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2846                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2847         }
2848 }
2849
2850 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2851 {
2852         int x, startx = span->startx, endx = span->endx;
2853         for (x = startx;x < endx;x++)
2854         {
2855                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2856                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2857                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2858                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2859         }
2860 }
2861
2862 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2863 {
2864         int x, startx = span->startx, endx = span->endx;
2865         float a, b;
2866         for (x = startx;x < endx;x++)
2867         {
2868                 a = 1.0f - inb4f[x*4+3];
2869                 b = inb4f[x*4+3];
2870                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2871                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2872                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2873                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2874         }
2875 }
2876
2877 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2878 {
2879         int x, startx = span->startx, endx = span->endx;
2880         float localcolor[4], ilerp, lerp;
2881         localcolor[0] = color[0];
2882         localcolor[1] = color[1];
2883         localcolor[2] = color[2];
2884         localcolor[3] = color[3];
2885         ilerp = 1.0f - localcolor[3];
2886         lerp = localcolor[3];
2887         for (x = startx;x < endx;x++)
2888         {
2889                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2890                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2891                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2892                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2893         }
2894 }
2895
2896
2897
2898 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2899 {
2900 #ifdef SSE_POSSIBLE
2901         int x;
2902         int startx = span->startx;
2903         int endx = span->endx;
2904         __m128 data, slope;
2905         __m128 mod, endmod;
2906         __m128i submod, substep, endsubmod;
2907         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2908         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2909         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2910         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2911         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2912         for (x = startx; x < endx;)
2913         {
2914                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2915                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2916                 if (nextsub >= endx)
2917                 {
2918                         nextsub = endsub = endx-1;
2919                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2920                 }
2921                 mod = endmod;
2922                 submod = endsubmod;
2923                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2924                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2925                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2926                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2927                 substep = _mm_packs_epi32(substep, substep);
2928                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2929                 {
2930                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2931                         pix = _mm_mulhi_epu16(pix, submod);
2932                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2933                 }
2934                 if (x <= endsub)
2935                 {
2936                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2937                         pix = _mm_mulhi_epu16(pix, submod);
2938                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2939                         x++;
2940                 }
2941         }
2942 #endif
2943 }
2944
2945 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2946 {
2947 #ifdef SSE_POSSIBLE
2948         int x;
2949         int startx = span->startx;
2950         int endx = span->endx;
2951         __m128 data, slope;
2952         __m128 mod, endmod;
2953         __m128i submod, substep, endsubmod;
2954         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2955         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2956         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2957         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2958         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2959         for (x = startx; x < endx;)
2960         {
2961                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2962                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2963                 if (nextsub >= endx)
2964                 {
2965                         nextsub = endsub = endx-1;
2966                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2967                 }
2968                 mod = endmod;
2969                 submod = endsubmod;
2970                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2971                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2972                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2973                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2974                 substep = _mm_packs_epi32(substep, substep);
2975                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2976                 {
2977                         __m128i pix = _mm_srai_epi16(submod, 4);
2978                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2979                 }
2980                 if (x <= endsub)
2981                 {
2982                         __m128i pix = _mm_srai_epi16(submod, 4);
2983                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2984                         x++;
2985                 }
2986         }
2987 #endif
2988 }
2989
2990 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2991 {
2992 #ifdef SSE_POSSIBLE
2993         int x, startx = span->startx, endx = span->endx;
2994         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2995         localcolor = _mm_packs_epi32(localcolor, localcolor);
2996         for (x = startx;x+2 <= endx;x+=2)
2997         {
2998                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2999                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3000                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3001                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3002         }
3003         if (x < endx)
3004         {
3005                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3006                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3007                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3008                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3009         }
3010 #endif
3011 }
3012
3013 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3014 {
3015 #ifdef SSE_POSSIBLE
3016         int x, startx = span->startx, endx = span->endx;
3017         for (x = startx;x+2 <= endx;x+=2)
3018         {
3019                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3020                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3021                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3022                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3023         }
3024         if (x < endx)
3025         {
3026                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3027                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3028                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3029                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3030         }
3031 #endif
3032 }
3033
3034 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3035 {
3036 #ifdef SSE_POSSIBLE
3037         int x, startx = span->startx, endx = span->endx;
3038         for (x = startx;x+2 <= endx;x+=2)
3039         {
3040                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3041                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3042                 pix1 = _mm_add_epi16(pix1, pix2);
3043                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3044         }
3045         if (x < endx)
3046         {
3047                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3048                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3049                 pix1 = _mm_add_epi16(pix1, pix2);
3050                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3051         }
3052 #endif
3053 }
3054
3055 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3056 {
3057 #ifdef SSE_POSSIBLE
3058         int x, startx = span->startx, endx = span->endx;
3059         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3060         tint = _mm_packs_epi32(tint, tint);
3061         for (x = startx;x+2 <= endx;x+=2)
3062         {
3063                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3064                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3065                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3066                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3067         }
3068         if (x < endx)
3069         {
3070                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3071                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3072                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3073                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3074         }
3075 #endif
3076 }
3077
3078 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3079 {
3080 #ifdef SSE_POSSIBLE
3081         int x, startx = span->startx, endx = span->endx;
3082         for (x = startx;x+2 <= endx;x+=2)
3083         {
3084                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3085                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3086                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3087                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3088                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3089         }
3090         if (x < endx)
3091         {
3092                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3093                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3094                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3095                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3096                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3097         }
3098 #endif
3099 }
3100
3101 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3102 {
3103 #ifdef SSE_POSSIBLE
3104         int x, startx = span->startx, endx = span->endx;
3105         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3106         localcolor = _mm_packs_epi32(localcolor, localcolor);
3107         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3108         for (x = startx;x+2 <= endx;x+=2)
3109         {
3110                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3111                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3112                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3113         }
3114         if (x < endx)
3115         {
3116                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3117                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3118                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3119         }
3120 #endif
3121 }
3122
3123
3124
3125 void DPSOFTRAST_VertexShader_Generic(void)
3126 {
3127         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3128         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3129         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3130         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3131                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3132 }
3133
3134 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3135 {
3136         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3137         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3138         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3139         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3140         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3141         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3142         {
3143                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3144                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3145                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3146                 {
3147                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3148                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3149                         {
3150                                 // multiply
3151                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3152                         }
3153                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3154                         {
3155                                 // add
3156                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3157                         }
3158                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3159                         {
3160                                 // alphablend
3161                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3162                         }
3163                 }
3164         }
3165         else
3166                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3167         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3168 }
3169
3170
3171
3172 void DPSOFTRAST_VertexShader_PostProcess(void)
3173 {
3174         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3175         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3176         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3177 }
3178
3179 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3180 {
3181         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3182         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3183         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3184         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3185         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3186         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3187         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3188         {
3189                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3190                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3191         }
3192         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3193         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3194         {
3195                 // TODO: implement saturation
3196         }
3197         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3198         {
3199                 // TODO: implement gammaramps
3200         }
3201         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3202 }
3203
3204
3205
3206 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3207 {
3208         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3209 }
3210
3211 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3212 {
3213         // this is never called (because colormask is off when this shader is used)
3214         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3215         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3216         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3217         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3218         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3219 }
3220
3221
3222
3223 void DPSOFTRAST_VertexShader_FlatColor(void)
3224 {
3225         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3226         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3227 }
3228
3229 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3230 {
3231 #ifdef SSE_POSSIBLE
3232         unsigned char * RESTRICT pixelmask = span->pixelmask;
3233         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3234         int x, startx = span->startx, endx = span->endx;
3235         __m128i Color_Ambientm;
3236         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3237         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3238         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3239         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3240         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3241         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3242                 pixel = buffer_FragColorbgra8;
3243         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3244         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3245         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3246         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3247         for (x = startx;x < endx;x++)
3248         {
3249                 __m128i color, pix;
3250                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3251                 {
3252                         __m128i pix2;
3253                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3254                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3255                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3256                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3257                         x += 3;
3258                         continue;
3259                 }
3260                 if (!pixelmask[x])
3261                         continue;
3262                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3263                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3264                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3265         }
3266         if (pixel == buffer_FragColorbgra8)
3267                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3268 #endif
3269 }
3270
3271
3272
3273 void DPSOFTRAST_VertexShader_VertexColor(void)
3274 {
3275         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3276         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3277         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3278 }
3279
3280 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3281 {
3282 #ifdef SSE_POSSIBLE
3283         unsigned char * RESTRICT pixelmask = span->pixelmask;
3284         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3285         int x, startx = span->startx, endx = span->endx;
3286         __m128i Color_Ambientm, Color_Diffusem;
3287         __m128 data, slope;
3288         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3289         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3290         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3291         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3292         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3293         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3294         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3295                 pixel = buffer_FragColorbgra8;
3296         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3297         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3298         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3299         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3300         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3301         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3302         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3303         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3304         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3305         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3306         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3307         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3308         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3309         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3310         {
3311                 __m128i color, mod, pix;
3312                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3313                 {
3314                         __m128i pix2, mod2;
3315                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3316                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3317                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3318                         data = _mm_add_ps(data, slope);
3319                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3320                         data = _mm_add_ps(data, slope);
3321                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3322                         data = _mm_add_ps(data, slope);
3323                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3324                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3325                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3326                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3327                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3328                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3329                         x += 3;
3330                         continue;
3331                 }
3332                 if (!pixelmask[x])
3333                         continue;
3334                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3335                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3336                 mod = _mm_packs_epi32(mod, mod);
3337                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3338                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3339         }
3340         if (pixel == buffer_FragColorbgra8)
3341                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3342 #endif
3343 }
3344
3345
3346
3347 void DPSOFTRAST_VertexShader_Lightmap(void)
3348 {
3349         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3350         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3351         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3352 }
3353
3354 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3355 {
3356 #ifdef SSE_POSSIBLE
3357         unsigned char * RESTRICT pixelmask = span->pixelmask;
3358         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3359         int x, startx = span->startx, endx = span->endx;
3360         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3361         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3362         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3363         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3364         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3365         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3366         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3367         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3368         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3369         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3370                 pixel = buffer_FragColorbgra8;
3371         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3372         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3373         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3374         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3375         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3376         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3377         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3378         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3379         {
3380                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3381                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3382                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3383                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3384                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3385                 for (x = startx;x < endx;x++)
3386                 {
3387                         __m128i color, lightmap, glow, pix;
3388                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3389                         {
3390                                 __m128i pix2;
3391                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3392                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3393                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3394                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3395                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3396                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3397                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3398                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3399                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3400                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3401                                 x += 3;
3402                                 continue;
3403                         }
3404                         if (!pixelmask[x])
3405                                 continue;
3406                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3407                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3408                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3409                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3410                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3411                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3412                 }
3413         }
3414         else
3415         {
3416                 for (x = startx;x < endx;x++)
3417                 {
3418                         __m128i color, lightmap, pix;
3419                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3420                         {
3421                                 __m128i pix2;
3422                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3423                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3424                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3425                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3426                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3427                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3428                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3429                                 x += 3;
3430                                 continue;
3431                         }
3432                         if (!pixelmask[x]) 
3433                                 continue;
3434                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3435                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3436                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3437                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3438                 }
3439         }
3440         if (pixel == buffer_FragColorbgra8)
3441                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3442 #endif
3443 }
3444
3445
3446 void DPSOFTRAST_VertexShader_LightDirection(void);
3447 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3448
// Vertex stage of the FakeLight shader mode.
// It has no work of its own: everything is done by the shared LightDirection
// vertex stage.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3453
3454 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3455 {
3456         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3457 }
3458
3459
3460
3461 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3462 {
3463         DPSOFTRAST_VertexShader_LightDirection();
3464         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3465 }
3466
3467 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3468 {
3469         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3470 }
3471
3472
3473
3474 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3475 {
3476         DPSOFTRAST_VertexShader_LightDirection();
3477         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3478 }
3479
3480 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3481 {
3482         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3483 }
3484
3485
3486
3487 void DPSOFTRAST_VertexShader_LightDirection(void)
3488 {
3489         int i;
3490         int numvertices = dpsoftrast.numvertices;
3491         float LightDir[4];
3492         float LightVector[4];
3493         float EyePosition[4];
3494         float EyeVectorModelSpace[4];
3495         float EyeVector[4];
3496         float position[4];
3497         float svector[4];
3498         float tvector[4];
3499         float normal[4];
3500         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3501         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3502         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3503         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3504         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3505         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3506         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3507         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3508         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3509         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3510         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3511         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3512         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3513         for (i = 0;i < numvertices;i++)
3514         {
3515                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3516                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3517                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3518                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3519                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3520                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3521                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3522                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3523                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3524                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3525                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3526                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3527                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3528                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3529                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3530                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3531                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3532                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3533                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3534                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3535                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3536                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3537                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3538                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3539                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3540                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3541                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3542                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3543                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3544         }
3545         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3546 }
3547
3548 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3549 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3550 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3551 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3552 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3553 #define DPSOFTRAST_Vector3Normalize(v)\
3554 do\
3555 {\
3556         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3557         if (len)\
3558         {\
3559                 len = 1.0f / len;\
3560                 v[0] *= len;\
3561                 v[1] *= len;\
3562                 v[2] *= len;\
3563         }\
3564 }\
3565 while(0)
3566
3567 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3568 {
3569         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3570         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3571         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3572         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3573         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3574         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3575         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3576         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3577         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3578         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3579         int x, startx = span->startx, endx = span->endx;
3580         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3581         float LightVectordata[4];
3582         float LightVectorslope[4];
3583         float EyeVectordata[4];
3584         float EyeVectorslope[4];
3585         float VectorSdata[4];
3586         float VectorSslope[4];
3587         float VectorTdata[4];
3588         float VectorTslope[4];
3589         float VectorRdata[4];
3590         float VectorRslope[4];
3591         float z;
3592         float diffusetex[4];
3593         float glosstex[4];
3594         float surfacenormal[4];
3595         float lightnormal[4];
3596         float lightnormal_modelspace[4];
3597         float eyenormal[4];
3598         float specularnormal[4];
3599         float diffuse;
3600         float specular;
3601         float SpecularPower;
3602         int d[4];
3603         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3604         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3605         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3606         Color_Glow[3] = 0.0f;
3607         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3608         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3609         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3610         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3611         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3612         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3613         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3614         Color_Pants[3] = 0.0f;
3615         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3616         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3617         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3618         Color_Shirt[3] = 0.0f;
3619         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3620         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3621         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3622         {
3623                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3624                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3625         }
3626         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3627         {
3628                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3629         }
3630         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3631         {
3632                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3633                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3634                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3635                 Color_Diffuse[3] = 0.0f;
3636                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3637                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3638                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3639                 LightColor[3] = 0.0f;
3640                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3641                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3642                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3643                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3644                 Color_Specular[3] = 0.0f;
3645                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3646                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3647                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3648
3649                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3650                 {
3651                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3652                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3653                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3654                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3655                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3656                 }
3657                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3658                 {
3659                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3660                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3661                 }
3662                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3663                 {
3664                         // nothing of this needed
3665                 }
3666                 else
3667                 {
3668                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3669                 }
3670
3671                 for (x = startx;x < endx;x++)
3672                 {
3673                         z = buffer_z[x];
3674                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3675                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3676                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3677                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3678                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3679                         {
3680                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3681                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3682                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3683                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3684                         }
3685                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3686                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3687                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3688                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3689                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3690                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3691                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3692                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3693
3694                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3695                         {
3696                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3697                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3698                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3699                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3700
3701                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3702                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3703                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3704                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3705
3706                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3707                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3708                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3709                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3710
3711                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3712                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3713                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3714                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3715
3716                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3717                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3718
3719                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3720                                 {
3721                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3722                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3723                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3724                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3725                                 }
3726                         }
3727                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3728                         {
3729                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3730                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3731                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3732                                 {
3733                                         float f = 1.0f / 256.0f;
3734                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3735                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3736                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3737                                 }
3738                         }
3739                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3740                         {
3741                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3742                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3743                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3744                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3745
3746                                 LightColor[0] = 1.0;
3747                                 LightColor[1] = 1.0;
3748                                 LightColor[2] = 1.0;
3749                         }
3750                         else
3751                         {
3752                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3753                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3754                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3755                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3756                         }
3757
3758                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3759
3760                         if(thread->shader_exactspecularmath)
3761                         {
3762                                 // reflect lightnormal at surfacenormal, take the negative of that
3763                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3764                                 float f;
3765                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3766                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3767                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3768                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3769
3770                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3771                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3772                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3773                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3774                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3775
3776                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3777                         }
3778                         else
3779                         {
3780                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3781                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3782                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3783                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3784
3785                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3786                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3787                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3788                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3789
3790                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3791                         }
3792
3793                         specular = pow(specular, SpecularPower * glosstex[3]);
3794                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3795                         {
3796                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3797                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3798                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3799                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3800                         }
3801                         else
3802                         {
3803                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3804                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3805                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3806                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3807                         }
3808
3809                         buffer_FragColorbgra8[x*4+0] = d[0];
3810                         buffer_FragColorbgra8[x*4+1] = d[1];
3811                         buffer_FragColorbgra8[x*4+2] = d[2];
3812                         buffer_FragColorbgra8[x*4+3] = d[3];
3813                 }
3814         }
3815         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3816         {
3817                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3818                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3819                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3820                 Color_Diffuse[3] = 0.0f;
3821                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3822                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3823                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3824                 LightColor[3] = 0.0f;
3825                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3826
3827                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3828                 {
3829                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3830                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3831                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3832                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3833                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3834                 }
3835                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3836                 {
3837                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3838                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3839                 }
3840                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3841                 {
3842                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3843                 }
3844                 else
3845                 {
3846                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3847                 }
3848
3849                 for (x = startx;x < endx;x++)
3850                 {
3851                         z = buffer_z[x];
3852                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3853                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3854                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3855                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3856                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3857                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3858                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3859                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3860
3861                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3862                         {
3863                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3864                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3865                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3866                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3867
3868                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3869                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3870                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3871                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3872
3873                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3874                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3875                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3876                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3877
3878                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3879                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3880                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3881                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3882
3883                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3884                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3885
3886                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3887                                 {
3888                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3889                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3890                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3891                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3892                                 }
3893                         }
3894                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3895                         {
3896                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3897                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3898                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3899                                 {
3900                                         float f = 1.0f / 256.0f;
3901                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3902                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3903                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3904                                 }
3905                         }
3906                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3907                         {
3908                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3909                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3910                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3911                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3912
3913                                 LightColor[0] = 1.0;
3914                                 LightColor[1] = 1.0;
3915                                 LightColor[2] = 1.0;
3916                         }
3917                         else
3918                         {
3919                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3920                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3921                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3922                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3923                         }
3924
3925                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3926                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3927                         {
3928                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3929                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3930                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3931                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3932                         }
3933                         else
3934                         {
3935                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3936                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3937                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3938                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3939                         }
3940                         buffer_FragColorbgra8[x*4+0] = d[0];
3941                         buffer_FragColorbgra8[x*4+1] = d[1];
3942                         buffer_FragColorbgra8[x*4+2] = d[2];
3943                         buffer_FragColorbgra8[x*4+3] = d[3];
3944                 }
3945         }
3946         else
3947         {
3948                 for (x = startx;x < endx;x++)
3949                 {
3950                         z = buffer_z[x];
3951                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3952                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3953                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3954                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3955
3956                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3957                         {
3958                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3959                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3960                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3961                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3962                         }
3963                         else
3964                         {
3965                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3966                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3967                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3968                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3969                         }
3970                         buffer_FragColorbgra8[x*4+0] = d[0];
3971                         buffer_FragColorbgra8[x*4+1] = d[1];
3972                         buffer_FragColorbgra8[x*4+2] = d[2];
3973                         buffer_FragColorbgra8[x*4+3] = d[3];
3974                 }
3975         }
3976         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3977 }
3978
3979
3980
3981 void DPSOFTRAST_VertexShader_LightSource(void)
3982 {
3983         int i;
3984         int numvertices = dpsoftrast.numvertices;
3985         float LightPosition[4];
3986         float LightVector[4];
3987         float LightVectorModelSpace[4];
3988         float EyePosition[4];
3989         float EyeVectorModelSpace[4];
3990         float EyeVector[4];
3991         float position[4];
3992         float svector[4];
3993         float tvector[4];
3994         float normal[4];
3995         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3996         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3997         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3998         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3999         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4000         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4001         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4002         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4003         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4004         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4005         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4006         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4007         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4008         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4009         for (i = 0;i < numvertices;i++)
4010         {
4011                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4012                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4013                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4014                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4015                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4016                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4017                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4018                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4019                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4020                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4021                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4022                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4023                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4024                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4025                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4026                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4027                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4028                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4029                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4030                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4031                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4032                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4033                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4034                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4035                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4036                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4037                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4038                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4039                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4040                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4041                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4042                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4043         }
4044         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4045         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4046 }
4047
4048 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4049 {
4050 #ifdef SSE_POSSIBLE
4051         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4052         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4053         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4054         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4055         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4056         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4057         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4058         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4059         int x, startx = span->startx, endx = span->endx;
4060         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4061         float CubeVectordata[4];
4062         float CubeVectorslope[4];
4063         float LightVectordata[4];
4064         float LightVectorslope[4];
4065         float EyeVectordata[4];
4066         float EyeVectorslope[4];
4067         float z;
4068         float diffusetex[4];
4069         float glosstex[4];
4070         float surfacenormal[4];
4071         float lightnormal[4];
4072         float eyenormal[4];
4073         float specularnormal[4];
4074         float diffuse;
4075         float specular;
4076         float SpecularPower;
4077         float CubeVector[4];
4078         float attenuation;
4079         int d[4];
4080         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4081         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4082         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4083         Color_Glow[3] = 0.0f;
4084         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4085         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4086         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4087         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4088         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4089         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4090         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4091         Color_Diffuse[3] = 0.0f;
4092         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4093         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4094         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4095         Color_Specular[3] = 0.0f;
4096         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4097         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4098         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4099         Color_Pants[3] = 0.0f;
4100         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4101         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4102         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4103         Color_Shirt[3] = 0.0f;
4104         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4105         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4106         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4107         LightColor[3] = 0.0f;
4108         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4109         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4110         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4111         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4112         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4113         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4114         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4115         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4116         {
4117                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4118                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4119         }
4120         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4121                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4122         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4123         {
4124                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4125                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4126                 for (x = startx;x < endx;x++)
4127                 {
4128                         z = buffer_z[x];
4129                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4130                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4131                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4132                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4133                         if (attenuation < 0.01f)
4134                                 continue;
4135                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4136                         {
4137                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4138                                 if (attenuation < 0.01f)
4139                                         continue;
4140                         }
4141
4142                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4143                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4144                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4145                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4146                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4147                         {
4148                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4149                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4150                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4151                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4152                         }
4153                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4154                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4155                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4156                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4157                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4158                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4159                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4160                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4161
4162                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4163                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4164                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4165                         DPSOFTRAST_Vector3Normalize(lightnormal);
4166
4167                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4168
4169                         if(thread->shader_exactspecularmath)
4170                         {
4171                                 // reflect lightnormal at surfacenormal, take the negative of that
4172                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4173                                 float f;
4174                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4175                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4176                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4177                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4178
4179                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4180                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4181                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4182                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4183                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4184
4185                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4186                         }
4187                         else
4188                         {
4189                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4190                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4191                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4192                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4193
4194                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4195                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4196                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4197                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4198
4199                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4200                         }
4201                         specular = pow(specular, SpecularPower * glosstex[3]);
4202
4203                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4204                         {
4205                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4206                                 attenuation *= (1.0f / 255.0f);
4207                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4208                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4209                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4210                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4211                         }
4212                         else
4213                         {
4214                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4215                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4216                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4217                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4218                         }
4219                         buffer_FragColorbgra8[x*4+0] = d[0];
4220                         buffer_FragColorbgra8[x*4+1] = d[1];
4221                         buffer_FragColorbgra8[x*4+2] = d[2];
4222                         buffer_FragColorbgra8[x*4+3] = d[3];
4223                 }
4224         }
4225         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4226         {
4227                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4228                 for (x = startx;x < endx;x++)
4229                 {
4230                         z = buffer_z[x];
4231                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4232                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4233                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4234                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4235                         if (attenuation < 0.01f)
4236                                 continue;
4237                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4238                         {
4239                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4240                                 if (attenuation < 0.01f)
4241                                         continue;
4242                         }
4243
4244                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4245                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4246                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4247                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4248                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4249                         {
4250                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4251                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4252                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4253                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4254                         }
4255                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4256                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4257                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4258                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4259
4260                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4261                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4262                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4263                         DPSOFTRAST_Vector3Normalize(lightnormal);
4264
4265                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4266                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4267                         {
4268                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4269                                 attenuation *= (1.0f / 255.0f);
4270                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4271                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4272                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4273                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4274                         }
4275                         else
4276                         {
4277                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4278                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4279                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4280                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4281                         }
4282                         buffer_FragColorbgra8[x*4+0] = d[0];
4283                         buffer_FragColorbgra8[x*4+1] = d[1];
4284                         buffer_FragColorbgra8[x*4+2] = d[2];
4285                         buffer_FragColorbgra8[x*4+3] = d[3];
4286                 }
4287         }
4288         else
4289         {
4290                 for (x = startx;x < endx;x++)
4291                 {
4292                         z = buffer_z[x];
4293                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4294                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4295                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4296                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4297                         if (attenuation < 0.01f)
4298                                 continue;
4299                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4300                         {
4301                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4302                                 if (attenuation < 0.01f)
4303                                         continue;
4304                         }
4305
4306                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4307                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4308                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4309                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4310                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4311                         {
4312                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4313                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4314                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4315                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4316                         }
4317                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4318                         {
4319                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4320                                 attenuation *= (1.0f / 255.0f);
4321                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4322                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4323                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4324                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4325                         }
4326                         else
4327                         {
4328                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4329                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4330                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4331                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4332                         }
4333                         buffer_FragColorbgra8[x*4+0] = d[0];
4334                         buffer_FragColorbgra8[x*4+1] = d[1];
4335                         buffer_FragColorbgra8[x*4+2] = d[2];
4336                         buffer_FragColorbgra8[x*4+3] = d[3];
4337                 }
4338         }
4339         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4340 #endif
4341 }
4342
4343
4344
4345 void DPSOFTRAST_VertexShader_Refraction(void)
4346 {
4347         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4348         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4349         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4350 }
4351
4352 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4353 {
4354         // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4355
4356         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4357         float z;
4358         int x, startx = span->startx, endx = span->endx;
4359
4360         // texture reads
4361         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4362         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4363
4364         // varyings
4365         float ModelViewProjectionPositiondata[4];
4366         float ModelViewProjectionPositionslope[4];
4367
4368         // uniforms
4369         float ScreenScaleRefractReflect[2];
4370         float ScreenCenterRefractReflect[2];
4371         float DistortScaleRefractReflect[2];
4372         float RefractColor[4];
4373
4374         const unsigned char * RESTRICT pixelbase;
4375         const unsigned char * RESTRICT pixel[4];
4376         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4377         if(!texture) return;
4378         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4379
4380         // read textures
4381         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4382         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4383
4384         // read varyings
4385         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4386
4387         // read uniforms
4388         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4389         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4390         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4391         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4392         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4393         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4394         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4395         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4396         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4397         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4398
4399         // do stuff
4400         for (x = startx;x < endx;x++)
4401         {
4402                 float SafeScreenTexCoord[2];
4403                 float ScreenTexCoord[2];
4404                 float v[3];
4405                 float iw;
4406                 unsigned char c[4];
4407
4408                 z = buffer_z[x];
4409
4410                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4411                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4412                 
4413                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4414                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4415                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4416
4417                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4418                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4419                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4420                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4421                 DPSOFTRAST_Vector3Normalize(v);
4422                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4423                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4424
4425                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4426                 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4427                 {
4428                         unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4429                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4430                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4431                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4432                         int tci[2] = { tc[0]>>12, tc[1]>>12 };
4433                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4434                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4435                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4436                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4437                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4438                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4439                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4440                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4441                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4442                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4443                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4444                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4445                 }
4446                 else
4447                 {
4448                         int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4449                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4450                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4451                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4452                         c[0] = pixel[0][0];
4453                         c[1] = pixel[0][1];
4454                         c[2] = pixel[0][2];
4455                 }
4456
4457                 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4458                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4459                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4460                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4461                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4462         }
4463
4464         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4465 }
4466
4467
4468
4469 void DPSOFTRAST_VertexShader_Water(void)
4470 {
4471         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4472 }
4473
4474
4475 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4476 {
4477         // TODO: IMPLEMENT
4478         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4479         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4480         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4481         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4482         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4483 }
4484
4485
4486
4487 void DPSOFTRAST_VertexShader_ShowDepth(void)
4488 {
4489         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4490 }
4491
4492 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4493 {
4494         // TODO: IMPLEMENT
4495         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4496         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4497         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4498         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4499         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4500 }
4501
4502
4503
4504 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4505 {
4506         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4507 }
4508
4509 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4510 {
4511         // TODO: IMPLEMENT
4512         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4513         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4514         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4515         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4516         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4517 }
4518
4519
4520
4521 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4522 {
4523         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4524 }
4525
4526 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4527 {
4528         // TODO: IMPLEMENT
4529         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4530         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4531         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4532         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4533         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4534 }
4535
4536
4537
4538 typedef struct DPSOFTRAST_ShaderModeInfo_s
4539 {
4540         int lodarrayindex;
4541         void (*Vertex)(void);
4542         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4543         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4544         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4545 }
4546 DPSOFTRAST_ShaderModeInfo;
4547
4548 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4549 {
4550         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4551         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4552         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4553         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4554         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4555         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4556         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4557         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4558         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4559         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4560         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4561         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4562         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4563         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4564         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4565         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4566 };
4567
4568 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4569 {
4570         int x;
4571         int startx;
4572         int endx;
4573         unsigned int *depthpixel;
4574         int depth;
4575         int depthslope;
4576         unsigned int d;
4577         unsigned char *pixelmask;
4578         DPSOFTRAST_State_Triangle *triangle;
4579         triangle = &thread->triangles[span->triangle];
4580         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4581         startx = span->startx;
4582         endx = span->endx;
4583         depth = span->depthbase;
4584         depthslope = span->depthslope;
4585         pixelmask = thread->pixelmaskarray;
4586         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4587         {
4588                 switch(thread->fb_depthfunc)
4589                 {
4590                 default:
4591                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4592                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4593                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4594                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4595                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4596                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4597                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4598                 }
4599                 while (startx < endx && !pixelmask[startx])
4600                         startx++;
4601                 while (endx > startx && !pixelmask[endx-1])
4602                         endx--;
4603         }
4604         else
4605         {
4606                 // no depth testing means we're just dealing with color...
4607                 memset(pixelmask + startx, 1, endx - startx);
4608         }
4609         span->pixelmask = pixelmask;
4610         span->startx = startx;
4611         span->endx = endx;
4612 }
4613
4614 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4615 {
4616         int x, d, depth, depthslope, startx, endx;
4617         const unsigned char *pixelmask;
4618         unsigned int *depthpixel;
4619         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4620         {
4621                 depth = span->depthbase;
4622                 depthslope = span->depthslope;
4623                 pixelmask = span->pixelmask;
4624                 startx = span->startx;
4625                 endx = span->endx;
4626                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4627                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4628                         if (pixelmask[x])
4629                                 depthpixel[x] = d;
4630         }
4631 }
4632
4633 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4634 {
4635         int i;
4636         DPSOFTRAST_State_Triangle *triangle;
4637         DPSOFTRAST_State_Span *span;
4638         for (i = 0; i < thread->numspans; i++)
4639         {
4640                 span = &thread->spans[i];
4641                 triangle = &thread->triangles[span->triangle];
4642                 DPSOFTRAST_Draw_DepthTest(thread, span);
4643                 if (span->startx >= span->endx)
4644                         continue;
4645                 // run pixel shader if appropriate
4646                 // do this before running depthmask code, to allow the pixelshader
4647                 // to clear pixelmask values for alpha testing
4648                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4649                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4650                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4651         }
4652         thread->numspans = 0;
4653 }
4654
4655 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4656
4657 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4658 {
4659 #ifdef SSE_POSSIBLE
4660         int cullface = thread->cullface;
4661         int minx, maxx, miny, maxy;
4662         int miny1, maxy1, miny2, maxy2;
4663         __m128i fbmin, fbmax;
4664         __m128 viewportcenter, viewportscale;
4665         int firstvertex = command->firstvertex;
4666         int numvertices = command->numvertices;
4667         int numtriangles = command->numtriangles;
4668         const int *element3i = command->element3i;
4669         const unsigned short *element3s = command->element3s;
4670         int clipped = command->clipped;
4671         int i;
4672         int j;
4673         int k;
4674         int y;
4675         int e[3];
4676         __m128i screeny;
4677         int starty, endy, bandy;
4678         int numpoints;
4679         int clipcase;
4680         float clipdist[4];
4681         float clip0origin, clip0slope;
4682         int clip0dir;
4683         __m128 triangleedge1, triangleedge2, trianglenormal;
4684         __m128 clipfrac[3];
4685         __m128 screen[4];
4686         DPSOFTRAST_State_Triangle *triangle;
4687         DPSOFTRAST_Texture *texture;
4688         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4689         miny = thread->fb_scissor[1];
4690         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4691         miny1 = bound(miny, thread->miny1, maxy);
4692         maxy1 = bound(miny, thread->maxy1, maxy);
4693         miny2 = bound(miny, thread->miny2, maxy);
4694         maxy2 = bound(miny, thread->maxy2, maxy);
4695         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4696         {
4697                 if (!ATOMIC_DECREMENT(command->refcount))
4698                 {
4699                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4700                                 MM_FREE(command->arrays);
4701                 }
4702                 return;
4703         }
4704         minx = thread->fb_scissor[0];
4705         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4706         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4707         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4708         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4709         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4710         screen[3] = _mm_setzero_ps();
4711         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4712         for (i = 0;i < numtriangles;i++)
4713         {
4714                 const float *screencoord4f = command->arrays;
4715                 const float *arrays = screencoord4f + numvertices*4;
4716
4717                 // generate the 3 edges of this triangle
4718                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4719                 if (element3s)
4720                 {
4721                         e[0] = element3s[i*3+0] - firstvertex;
4722                         e[1] = element3s[i*3+1] - firstvertex;
4723                         e[2] = element3s[i*3+2] - firstvertex;
4724                 }
4725                 else if (element3i)
4726                 {
4727                         e[0] = element3i[i*3+0] - firstvertex;
4728                         e[1] = element3i[i*3+1] - firstvertex;
4729                         e[2] = element3i[i*3+2] - firstvertex;
4730                 }
4731                 else
4732                 {
4733                         e[0] = i*3+0;
4734                         e[1] = i*3+1;
4735                         e[2] = i*3+2;
4736                 }
4737
4738 #define SKIPBACKFACE \
4739                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4740                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4741                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4742                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4743                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4744                 switch(cullface) \
4745                 { \
4746                 case GL_BACK: \
4747                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4748                                 continue; \
4749                         break; \
4750                 case GL_FRONT: \
4751                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4752                                 continue; \
4753                         break; \
4754                 }
4755
4756 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4757                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4758                         { \
4759                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4760                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4761                         }
4762 #define CLIPPEDVERTEXCOPY(k,p1) \
4763                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4764
4765 #define GENATTRIBCOPY(attrib, p1) \
4766                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4767 #define GENATTRIBLERP(attrib, p1, p2) \
4768                 { \
4769                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4770                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4771                 }
4772 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4773                 switch(clipcase) \
4774                 { \
4775                 default: \
4776                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4777                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4778                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4779                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4780                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4781                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4782                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4783                 }
4784
4785                 if (! clipped)
4786                         goto notclipped;
4787
4788                 // calculate distance from nearplane
4789                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4790                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4791                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4792                 if (clipdist[0] >= 0.0f)
4793                 {
4794                         if (clipdist[1] >= 0.0f)
4795                         {
4796                                 if (clipdist[2] >= 0.0f)
4797                                 {
4798                                 notclipped:
4799                                         // triangle is entirely in front of nearplane
4800                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4801                                         SKIPBACKFACE;
4802                                         numpoints = 3;
4803                                         clipcase = 0;
4804                                 }
4805                                 else
4806                                 {
4807                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4808                                         SKIPBACKFACE;
4809                                         numpoints = 4;
4810                                         clipcase = 1;
4811                                 }
4812                         }
4813                         else
4814                         {
4815                                 if (clipdist[2] >= 0.0f)
4816                                 {
4817                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4818                                         SKIPBACKFACE;
4819                                         numpoints = 4;
4820                                         clipcase = 2;
4821                                 }
4822                                 else
4823                                 {
4824                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4825                                         SKIPBACKFACE;
4826                                         numpoints = 3;
4827                                         clipcase = 3;
4828                                 }
4829                         }
4830                 }
4831                 else if (clipdist[1] >= 0.0f)
4832                 {
4833                         if (clipdist[2] >= 0.0f)
4834                         {
4835                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4836                                 SKIPBACKFACE;
4837                                 numpoints = 4;
4838                                 clipcase = 4;
4839                         }
4840                         else
4841                         {
4842                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4843                                 SKIPBACKFACE;
4844                                 numpoints = 3;
4845                                 clipcase = 5;
4846                         }
4847                 }
4848                 else if (clipdist[2] >= 0.0f)
4849                 {
4850                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4851                         SKIPBACKFACE;
4852                         numpoints = 3;
4853                         clipcase = 6;
4854                 }
4855                 else continue; // triangle is entirely behind nearplane
4856
4857                 {
4858                         // calculate integer y coords for triangle points
4859                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4860                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4861                                         screenmin = _mm_min_epi16(screeni, screenir),
4862                                         screenmax = _mm_max_epi16(screeni, screenir);
4863                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4864                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4865                         screenmin = _mm_max_epi16(screenmin, fbmin);
4866                         screenmax = _mm_min_epi16(screenmax, fbmax);
4867                         // skip offscreen triangles
4868                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4869                                 continue;
4870                         starty = _mm_extract_epi16(screenmin, 1);
4871                         endy = _mm_extract_epi16(screenmax, 1)+1;
4872                         if (starty >= maxy1 && endy <= miny2)
4873                                 continue;
4874                         screeny = _mm_srai_epi32(screeni, 16);
4875                 }
4876
4877                 triangle = &thread->triangles[thread->numtriangles];
4878
4879                 // calculate attribute plans for triangle data...
4880                 // okay, this triangle is going to produce spans, we'd better project
4881                 // the interpolants now (this is what gives perspective texturing),
4882                 // this consists of simply multiplying all arrays by the W coord
4883                 // (which is basically 1/Z), which will be undone per-pixel
4884                 // (multiplying by Z again) to get the perspective-correct array
4885                 // values
4886                 {
4887                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4888                         __m128 mipedgescale, mipdensity;
4889                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4890                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4891                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4892                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4893                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4894                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4895                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4896                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4897                         attribedge1 = _mm_sub_ss(w0, w1);
4898                         attribedge2 = _mm_sub_ss(w2, w1);
4899                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4900                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4901                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4902                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4903                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4904                         _mm_store_ss(&triangle->w[0], attribxslope);
4905                         _mm_store_ss(&triangle->w[1], attribyslope);
4906                         _mm_store_ss(&triangle->w[2], attriborigin);
4907                         
4908                         clip0origin = 0;
4909                         clip0slope = 0;
4910                         clip0dir = 0;
4911                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
4912                         {
4913                                 float cliporigin, clipxslope, clipyslope;
4914                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
4915                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4916                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4917                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4918                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4919                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4920                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
4921                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
4922                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
4923                                 if(clipxslope != 0)
4924                                 {
4925                                         clip0origin = -cliporigin/clipxslope;
4926                                         clip0slope = -clipyslope/clipxslope;
4927                                         clip0dir = clipxslope > 0 ? 1 : -1;
4928                                 }
4929                                 else if(clipyslope > 0)
4930                                 {
4931                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
4932                                         clip0slope = dpsoftrast.fb_width;
4933                                         clip0dir = -1;
4934                                 }
4935                                 else if(clipyslope < 0)
4936                                 {
4937                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
4938                                         clip0slope = -dpsoftrast.fb_width;
4939                                         clip0dir = -1;
4940                                 }
4941                                 else if(clip0origin < 0) continue;
4942                         }
4943
4944                         mipedgescale = _mm_setzero_ps();
4945                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4946                         {
4947                                 __m128 attrib0, attrib1, attrib2;
4948                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4949                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4950                                         break;
4951                                 arrays += numvertices*4;
4952                                 GENATTRIBS(attrib0, attrib1, attrib2);
4953                                 attriborigin = _mm_mul_ps(attrib1, w1);
4954                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4955                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4956                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4957                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4958                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4959                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4960                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4961                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
4962                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4963                                 {
4964                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4965                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4966                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4967                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4968                                 }
4969                         }
4970
4971                         memset(triangle->mip, 0, sizeof(triangle->mip));
4972                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4973                         {
4974                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4975                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4976                                         break;
4977                                 texture = thread->texbound[texunit];
4978                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4979                                 {
4980                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4981                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4982                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4983                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4984                                         // this will be multiplied in the texturing routine by the texture resolution
4985                                         y = _mm_cvtss_si32(mipdensity);
4986                                         if (y > 0)
4987                                         {
4988                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4989                                                 if (y > texture->mipmaps - 1)
4990                                                         y = texture->mipmaps - 1;
4991                                                 triangle->mip[texunit] = y;
4992                                         }
4993                                 }
4994                         }
4995                 }
4996         
4997                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4998                 for (; y < bandy;)
4999                 {
5000                         __m128 xcoords, xslope;
5001                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5002                         int yccmask = _mm_movemask_epi8(ycc);
5003                         int edge0p, edge0n, edge1p, edge1n;
5004                         int nexty;
5005                         float w, wslope;
5006                         float clip0;
5007                         if (numpoints == 4)
5008                         {
5009                                 switch(yccmask)
5010                                 {
5011                                 default:
5012                                 case 0xFFFF: /*0000*/ y = endy; continue;
5013                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5014                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5015                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5016                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5017                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5018                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5019                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5020                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5021                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5022                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5023                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5024                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5025                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5026                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5027                                 case 0x0000: /*1111*/ y++; continue;
5028                                 }
5029                         }
5030                         else
5031                         {
5032                                 switch(yccmask)
5033                                 {
5034                                 default:
5035                                 case 0xFFFF: /*000*/ y = endy; continue;
5036                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5037                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5038                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5039                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5040                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5041                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5042                                 case 0x0000: /*111*/ y++; continue;
5043                                 }
5044                         }
5045                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5046                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5047                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5048                         nexty = _mm_extract_epi16(ycc, 0);
5049                         if (nexty >= bandy) nexty = bandy-1;
5050                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5051                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5052                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5053                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5054                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5055                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5056                         {
5057                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5058                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5059                         }
5060                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5061                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5062                         {
5063                                 int startx, endx, offset;
5064                                 startx = _mm_cvtss_si32(xcoords);
5065                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5066                                 if (startx < minx) startx = minx;
5067                                 if (endx > maxx) endx = maxx;
5068                                 if (startx >= endx) continue;
5069
5070                                 if (clip0dir)
5071                                 {
5072                                         if (clip0dir > 0)
5073                                         {
5074                                                 if (startx < clip0) 
5075                                                 {
5076                                                         if(endx <= clip0) continue;
5077                                                         startx = (int)clip0;
5078                                                 }
5079                                         }
5080                                         else if (endx > clip0) 
5081                                         {
5082                                                 if(startx >= clip0) continue;
5083                                                 endx = (int)clip0;
5084                                         }
5085                                 }
5086                                                 
5087                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5088                                 {
5089                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5090                                         span->triangle = thread->numtriangles;
5091                                         span->x = offset;
5092                                         span->y = y;
5093                                         span->startx = 0;
5094                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5095                                         if (span->startx >= span->endx)
5096                                                 continue;
5097                                         wslope = triangle->w[0];
5098                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5099                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5100                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5101                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5102                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5103                                 }
5104                         }
5105                 }
5106
5107                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5108                 {
5109                         DPSOFTRAST_Draw_ProcessSpans(thread);
5110                         thread->numtriangles = 0;
5111                 }
5112         }
5113
5114         if (!ATOMIC_DECREMENT(command->refcount))
5115         {
5116                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5117                         MM_FREE(command->arrays);
5118         }
5119
5120         if (thread->numspans > 0 || thread->numtriangles > 0)
5121         {
5122                 DPSOFTRAST_Draw_ProcessSpans(thread);
5123                 thread->numtriangles = 0;
5124         }
5125 #endif
5126 }
5127
5128 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5129 {
5130         int i;
5131         int j;
5132         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5133         int datasize = 2*numvertices*sizeof(float[4]);
5134         DPSOFTRAST_Command_Draw *command;
5135         unsigned char *data;
5136         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5137         {
5138                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5139                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5140                         break;
5141                 datasize += numvertices*sizeof(float[4]);
5142         }
5143         if (element3s)
5144                 datasize += numtriangles*sizeof(unsigned short[3]);
5145         else if (element3i)
5146                 datasize += numtriangles*sizeof(int[3]);
5147         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5148         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5149         {
5150                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5151                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5152         }
5153         else
5154         {
5155                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5156                 data = (unsigned char *)command + commandsize;
5157         }
5158         command->firstvertex = firstvertex;
5159         command->numvertices = numvertices;
5160         command->numtriangles = numtriangles;
5161         command->arrays = (float *)data;
5162         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5163         dpsoftrast.firstvertex = firstvertex;
5164         dpsoftrast.numvertices = numvertices;
5165         dpsoftrast.screencoord4f = (float *)data;
5166         data += numvertices*sizeof(float[4]);
5167         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5168         data += numvertices*sizeof(float[4]);
5169         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5170         {
5171                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5172                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5173                         break;
5174                 dpsoftrast.post_array4f[j] = (float *)data;
5175                 data += numvertices*sizeof(float[4]);
5176         }
5177         command->element3i = NULL;
5178         command->element3s = NULL;
5179         if (element3s)
5180         {
5181                 command->element3s = (unsigned short *)data;
5182                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5183         }
5184         else if (element3i)
5185         {
5186                 command->element3i = (int *)data;
5187                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5188         }
5189         return command;
5190 }
5191
5192 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5193 {
5194         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5195         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5196         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5197         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5198         if (command->starty >= command->endy)
5199         {
5200                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5201                         MM_FREE(command->arrays);
5202                 DPSOFTRAST_UndoCommand(command->commandsize);
5203                 return;
5204         }
5205         command->clipped = dpsoftrast.drawclipped;
5206         command->refcount = dpsoftrast.numthreads;
5207
5208         if (dpsoftrast.usethreads)
5209         {
5210                 int i;
5211                 DPSOFTRAST_Draw_SyncCommands();
5212                 for (i = 0; i < dpsoftrast.numthreads; i++)
5213                 {
5214                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5215                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5216                                 Thread_CondSignal(thread->drawcond);
5217                 }
5218         }
5219         else
5220         {
5221                 DPSOFTRAST_Draw_FlushThreads();
5222         }
5223 }
5224
5225 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5226 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5227 {
5228         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5229 }
5230 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5231 {
5232         DPSOFTRAST_Command_SetRenderTargets *command;
5233         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5234                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5235                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5236                 DPSOFTRAST_Flush();
5237         dpsoftrast.fb_width = width;
5238         dpsoftrast.fb_height = height;
5239         dpsoftrast.fb_depthpixels = depthpixels;
5240         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5241         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5242         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5243         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5244         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5245         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5246         command->width = width;
5247         command->height = height;
5248 }
5249  
5250 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5251 {
5252         int commandoffset = thread->commandoffset;
5253         while (commandoffset != endoffset)
5254         {
5255                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5256                 switch (command->opcode)
5257                 {
5258 #define INTERPCOMMAND(name) \
5259                 case DPSOFTRAST_OPCODE_##name : \
5260                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5261                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5262                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5263                                 commandoffset = 0; \
5264                         break;
5265                 INTERPCOMMAND(Viewport)
5266                 INTERPCOMMAND(ClearColor)
5267                 INTERPCOMMAND(ClearDepth)
5268                 INTERPCOMMAND(ColorMask)
5269                 INTERPCOMMAND(DepthTest)
5270                 INTERPCOMMAND(ScissorTest)
5271                 INTERPCOMMAND(Scissor)
5272                 INTERPCOMMAND(BlendFunc)
5273                 INTERPCOMMAND(BlendSubtract)
5274                 INTERPCOMMAND(DepthMask)
5275                 INTERPCOMMAND(DepthFunc)
5276                 INTERPCOMMAND(DepthRange)
5277                 INTERPCOMMAND(PolygonOffset)
5278                 INTERPCOMMAND(CullFace)
5279                 INTERPCOMMAND(AlphaTest)
5280                 INTERPCOMMAND(AlphaFunc)
5281                 INTERPCOMMAND(SetTexture)
5282                 INTERPCOMMAND(SetShader)
5283                 INTERPCOMMAND(Uniform4f)
5284                 INTERPCOMMAND(UniformMatrix4f)
5285                 INTERPCOMMAND(Uniform1i)
5286                 INTERPCOMMAND(SetRenderTargets)
5287                 INTERPCOMMAND(ClipPlane)
5288
5289                 case DPSOFTRAST_OPCODE_Draw:
5290                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5291                         commandoffset += command->commandsize;
5292                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5293                                 commandoffset = 0;
5294                         thread->commandoffset = commandoffset;
5295                         break;
5296
5297                 case DPSOFTRAST_OPCODE_Reset:
5298                         commandoffset = 0;
5299                         break;
5300                 }
5301         }
5302         thread->commandoffset = commandoffset;
5303 }
5304
5305 static int DPSOFTRAST_Draw_Thread(void *data)
5306 {
5307         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5308         while(thread->index >= 0)
5309         {
5310                 if (thread->commandoffset != dpsoftrast.drawcommand)
5311                 {
5312                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5313                 }
5314                 else 
5315                 {
5316                         Thread_LockMutex(thread->drawmutex);
5317                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5318                         {
5319                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5320                                 thread->starving = true;
5321                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5322                                 thread->starving = false;
5323                         }
5324                         Thread_UnlockMutex(thread->drawmutex);
5325                 }
5326         }   
5327         return 0;
5328 }
5329
5330 static void DPSOFTRAST_Draw_FlushThreads(void)
5331 {
5332         DPSOFTRAST_State_Thread *thread;
5333         int i;
5334         DPSOFTRAST_Draw_SyncCommands();
5335         if (dpsoftrast.usethreads) 
5336         {
5337                 for (i = 0; i < dpsoftrast.numthreads; i++)
5338                 {
5339                         thread = &dpsoftrast.threads[i];
5340                         if (thread->commandoffset != dpsoftrast.drawcommand)
5341                         {
5342                                 Thread_LockMutex(thread->drawmutex);
5343                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5344                                         Thread_CondSignal(thread->drawcond);
5345                                 Thread_UnlockMutex(thread->drawmutex);
5346                         }
5347                 }
5348                 for (i = 0; i < dpsoftrast.numthreads; i++)
5349                 {
5350                         thread = &dpsoftrast.threads[i];
5351                         if (thread->commandoffset != dpsoftrast.drawcommand)
5352                         {
5353                                 Thread_LockMutex(thread->drawmutex);
5354                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5355                                 {
5356                                         thread->waiting = true;
5357                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5358                                         thread->waiting = false;
5359                                 }
5360                                 Thread_UnlockMutex(thread->drawmutex);
5361                         }
5362                 }
5363         }
5364         else
5365         {
5366                 for (i = 0; i < dpsoftrast.numthreads; i++)
5367                 {
5368                         thread = &dpsoftrast.threads[i];
5369                         if (thread->commandoffset != dpsoftrast.drawcommand)
5370                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5371                 }
5372         }
5373         dpsoftrast.commandpool.usedcommands = 0;
5374 }
5375
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads, which
// runs all queued commands to completion before returning.
void DPSOFTRAST_Flush(void)
{
        DPSOFTRAST_Draw_FlushThreads();
}
5380
// Public finish entry point: currently identical to DPSOFTRAST_Flush.
void DPSOFTRAST_Finish(void)
{
        DPSOFTRAST_Flush();
}
5385
5386 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5387 {
5388         int i;
5389         union
5390         {
5391                 int i;
5392                 unsigned char b[4];
5393         }
5394         u;
5395         u.i = 1;
5396         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5397         dpsoftrast.bigendian = u.b[3];
5398         dpsoftrast.fb_width = width;
5399         dpsoftrast.fb_height = height;
5400         dpsoftrast.fb_depthpixels = depthpixels;
5401         dpsoftrast.fb_colorpixels[0] = colorpixels;
5402         dpsoftrast.fb_colorpixels[1] = NULL;
5403         dpsoftrast.fb_colorpixels[1] = NULL;
5404         dpsoftrast.fb_colorpixels[1] = NULL;
5405         dpsoftrast.viewport[0] = 0;
5406         dpsoftrast.viewport[1] = 0;
5407         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5408         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5409         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5410         dpsoftrast.texture_firstfree = 1;
5411         dpsoftrast.texture_end = 1;
5412         dpsoftrast.texture_max = 0;
5413         dpsoftrast.color[0] = 1;
5414         dpsoftrast.color[1] = 1;
5415         dpsoftrast.color[2] = 1;
5416         dpsoftrast.color[3] = 1;
5417         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5418         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5419         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5420         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5421         for (i = 0; i < dpsoftrast.numthreads; i++)
5422         {
5423                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5424                 thread->index = i;
5425                 thread->cullface = GL_BACK;
5426         thread->colormask[0] = 1; 
5427                 thread->colormask[1] = 1;
5428                 thread->colormask[2] = 1;
5429                 thread->colormask[3] = 1;
5430                 thread->blendfunc[0] = GL_ONE;
5431                 thread->blendfunc[1] = GL_ZERO;
5432                 thread->depthmask = true;
5433                 thread->depthtest = true;
5434                 thread->depthfunc = GL_LEQUAL;
5435                 thread->scissortest = false;
5436                 thread->alphatest = false;
5437                 thread->alphafunc = GL_GREATER;
5438                 thread->alphavalue = 0.5f;
5439                 thread->viewport[0] = 0;
5440                 thread->viewport[1] = 0;
5441                 thread->viewport[2] = dpsoftrast.fb_width;
5442                 thread->viewport[3] = dpsoftrast.fb_height;
5443                 thread->scissor[0] = 0;
5444                 thread->scissor[1] = 0;
5445                 thread->scissor[2] = dpsoftrast.fb_width;
5446                 thread->scissor[3] = dpsoftrast.fb_height;
5447                 thread->depthrange[0] = 0;
5448                 thread->depthrange[1] = 1;
5449                 thread->polygonoffset[0] = 0;
5450                 thread->polygonoffset[1] = 0;
5451                 thread->clipplane[0] = 0;
5452                 thread->clipplane[1] = 0;
5453                 thread->clipplane[2] = 0;
5454                 thread->clipplane[3] = 1;
5455         
5456                 thread->numspans = 0;
5457                 thread->numtriangles = 0;
5458                 thread->commandoffset = 0;
5459                 thread->waiting = false;
5460                 thread->starving = false;
5461            
5462                 thread->validate = -1;
5463                 DPSOFTRAST_Validate(thread, -1);
5464  
5465                 if (dpsoftrast.usethreads)
5466                 {
5467                         thread->waitcond = Thread_CreateCond();
5468                         thread->drawcond = Thread_CreateCond();
5469                         thread->drawmutex = Thread_CreateMutex();
5470                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5471                 }
5472         }
5473         return 0;
5474 }
5475
5476 void DPSOFTRAST_Shutdown(void)
5477 {
5478         int i;
5479         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5480         {
5481                 DPSOFTRAST_State_Thread *thread;
5482                 for (i = 0; i < dpsoftrast.numthreads; i++)
5483                 {
5484                         thread = &dpsoftrast.threads[i];
5485                         Thread_LockMutex(thread->drawmutex);
5486                         thread->index = -1;
5487                         Thread_CondSignal(thread->drawcond);
5488                         Thread_UnlockMutex(thread->drawmutex);
5489                         Thread_WaitThread(thread->thread, 0);
5490                         Thread_DestroyCond(thread->waitcond);
5491                         Thread_DestroyCond(thread->drawcond);
5492                         Thread_DestroyMutex(thread->drawmutex);
5493                 }
5494         }
5495         for (i = 0;i < dpsoftrast.texture_end;i++)
5496                 if (dpsoftrast.texture[i].bytes)
5497                         MM_FREE(dpsoftrast.texture[i].bytes);
5498         if (dpsoftrast.texture)
5499                 free(dpsoftrast.texture);
5500         if (dpsoftrast.threads)
5501                 MM_FREE(dpsoftrast.threads);
5502         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5503 }
5504