]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
d72c52b2e6e150def18f27803437fe97d98f98b6
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
17 #define ALIGN_SIZE 16
18 #define ATOMIC_SIZE 32
19
// Alignment and atomic helpers.
//   ALIGN(var)               declare var with 16 byte alignment
//   ATOMIC(var)              declare var with 32 byte alignment
//   MEMORY_BARRIER           store fence (_mm_sfence) on SSE builds
//   ATOMIC_COUNTER           type used for counters shared between threads
//   ATOMIC_INCREMENT/DECREMENT/ADD  adjust such a counter
// Platform-specific versions are only selected when SSE_POSSIBLE is defined;
// anything left undefined afterwards falls back to plain, non-atomic C.
#if defined(SSE_POSSIBLE) && defined(__APPLE__)
	#include <libkern/OSAtomic.h>
	#define ALIGN(var) var __attribute__((__aligned__(16)))
	#define ATOMIC(var) var __attribute__((__aligned__(32)))
	#define MEMORY_BARRIER (_mm_sfence())
	#define ATOMIC_COUNTER volatile int32_t
	#define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
	#define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
	#define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
#elif defined(SSE_POSSIBLE) && defined(__GNUC__)
	#define ALIGN(var) var __attribute__((__aligned__(16)))
	#define ATOMIC(var) var __attribute__((__aligned__(32)))
	// alternative barrier: (__sync_synchronize())
	#define MEMORY_BARRIER (_mm_sfence())
	#define ATOMIC_COUNTER volatile int
	#define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
	#define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
	#define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
#elif defined(SSE_POSSIBLE) && defined(_MSC_VER)
	#define ALIGN(var) __declspec(align(16)) var
	#define ATOMIC(var) __declspec(align(32)) var
	// alternative barrier: (MemoryBarrier())
	#define MEMORY_BARRIER (_mm_sfence())
	#define ATOMIC_COUNTER volatile LONG
	#define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
	#define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
	#define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
#endif

// portable fallbacks: no alignment, no barrier, ordinary integer arithmetic
#if !defined(ALIGN)
#define ALIGN(var) var
#endif
#if !defined(ATOMIC)
#define ATOMIC(var) var
#endif
#if !defined(MEMORY_BARRIER)
#define MEMORY_BARRIER ((void)0)
#endif
#if !defined(ATOMIC_COUNTER)
#define ATOMIC_COUNTER int
#endif
#if !defined(ATOMIC_INCREMENT)
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#if !defined(ATOMIC_DECREMENT)
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#if !defined(ATOMIC_ADD)
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
72
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// GCC releases before 4.6 get _mm_cvtss_f32 supplied through the builtin
// (presumably because their headers lack it).  The version test must compare
// __GNUC__ (not the undefined __GNUC) and only look at the minor number when
// the major number is exactly 4, otherwise e.g. GCC 5.1 would match as well.
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6))
	#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif

// allocate size bytes aligned to ATOMIC_SIZE; release with MM_FREE
#define MM_MALLOC(size) _mm_malloc((size), ATOMIC_SIZE)

// calloc() counterpart of MM_MALLOC: returns nmemb*size zeroed bytes aligned
// to ATOMIC_SIZE, or NULL if the allocation fails or the byte count does not
// fit in a size_t
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	void *ptr;
	// reject element counts whose total size would wrap around
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
	if (ptr != NULL) memset(ptr, 0, nmemb*size);
	return ptr;
}

#define MM_FREE _mm_free
#else
// without SSE there is no alignment requirement, use the C allocator directly
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
95
// indices of the per-vertex attribute arrays; DPSOFTRAST_ARRAY_TOTAL is the
// number of arrays and is used to size per-attribute tables
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION  = 0,
	DPSOFTRAST_ARRAY_COLOR     = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL     = 10
}
DPSOFTRAST_ARRAY;
111
112 typedef struct DPSOFTRAST_Texture_s
113 {
114         int flags;
115         int width;
116         int height;
117         int depth;
118         int sides;
119         DPSOFTRAST_TEXTURE_FILTER filter;
120         int mipmaps;
121         int size;
122         ATOMIC_COUNTER binds;
123         unsigned char *bytes;
124         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
125 }
126 DPSOFTRAST_Texture;
127
// commands use the same alignment as ALIGN()
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// header shared by every command: DEFCOMMAND below starts each command
// struct with these same two members
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode; // one of the DPSOFTRAST_OPCODE_ values
	unsigned short commandsize; // presumably the size of the whole command - TODO confirm against the queue code
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(value, Name, members) declares the constant
// DPSOFTRAST_OPCODE_Name = value and the struct type DPSOFTRAST_Command_Name
// consisting of the common header followed by the given members
#define DEFCOMMAND(opcodevalue, cmdname, members) \
	enum { DPSOFTRAST_OPCODE_##cmdname = opcodevalue }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##cmdname##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		members \
	} DPSOFTRAST_Command_##cmdname );
148
149 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
150 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
151
152 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
153 {
154         int freecommand;
155         int usedcommands;
156         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
157 }
158 DPSOFTRAST_State_Command_Pool);
159
160 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
161 {
162         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
163         float w[3];
164         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
165 }
166 DPSOFTRAST_State_Triangle);
167
// Evaluate attribute array 'arrayindex' of a triangle at the start of a span.
//   slope receives attribs[arrayindex][0] (the change per pixel in x)
//   data  receives attribs[arrayindex][2] + span->x * attribs[arrayindex][0]
//                                         + span->y * attribs[arrayindex][1]
// SSE version: data and slope are __m128 lvalues.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version: data and slope are arrays of (or pointers to) 4 floats.
// Every macro parameter is parenthesized so that expression arguments such as
// 'spans + i' expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
184                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// one horizontal run of pixels belonging to a triangle
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	// triangle this span was generated by
	int triangle;
	// framebuffer x coord
	int x;
	// framebuffer y coord
	int y;
	// usable range (according to pixelmask)
	int startx;
	int endx;
	// true for pixels that passed depth test, false for others
	unsigned char *pixelmask;
	// depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
	int depthbase;
	// depthbuffer value pixel delta
	int depthslope;
}
DPSOFTRAST_State_Span);
199
// per-thread batch limits
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// bits of the per-thread 'validate' field; a set bit means the matching
// derived values have to be recalculated before use
#define DPSOFTRAST_VALIDATE_FB (1 << 0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1 << 1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1 << 2)
// everything a draw depends on
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)

// blend modes derived from the blendfunc state; DPSOFTRAST_BLENDMODE_TOTAL is
// the number of modes
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
	DPSOFTRAST_BLENDMODE_ALPHA       = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
	DPSOFTRAST_BLENDMODE_ADD         = 3,
	DPSOFTRAST_BLENDMODE_INVMOD      = 4,
	DPSOFTRAST_BLENDMODE_MUL         = 5,
	DPSOFTRAST_BLENDMODE_MUL2        = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD      = 9,
	DPSOFTRAST_BLENDMODE_TOTAL       = 10
}
DPSOFTRAST_BLENDMODE;
224
225 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
226 {
227         void *thread;
228         int index;
229         
230         int cullface;
231         int colormask[4];
232         int blendfunc[2];
233         int blendsubtract;
234         int depthmask;
235         int depthtest;
236         int depthfunc;
237         int scissortest;
238         int alphatest;
239         int alphafunc;
240         float alphavalue;
241         int viewport[4];
242         int scissor[4];
243         float depthrange[2];
244         float polygonoffset[2];
245         float clipplane[4];
246         ALIGN(float fb_clipplane[4]);
247
248         int shader_mode;
249         int shader_permutation;
250         int shader_exactspecularmath;
251
252         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
253         
254         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
255         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
256
257         // DPSOFTRAST_VALIDATE_ flags
258         int validate;
259
260         // derived values (DPSOFTRAST_VALIDATE_FB)
261         int fb_colormask;
262         int fb_scissor[4];
263         ALIGN(float fb_viewportcenter[4]);
264         ALIGN(float fb_viewportscale[4]);
265
266         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
267         int fb_depthfunc;
268
269         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
270         int fb_blendmode;
271
272         // band boundaries
273         int miny1;
274         int maxy1;
275         int miny2;
276         int maxy2;
277
278         ATOMIC(volatile int commandoffset);
279
280         volatile bool waiting;
281         volatile bool starving;
282         void *waitcond;
283         void *drawcond;
284         void *drawmutex;
285
286         int numspans;
287         int numtriangles;
288         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
289         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
290         unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
291 }
292 DPSOFTRAST_State_Thread);
293
294 typedef ATOMIC(struct DPSOFTRAST_State_s
295 {
296         int fb_width;
297         int fb_height;
298         unsigned int *fb_depthpixels;
299         unsigned int *fb_colorpixels[4];
300
301         int viewport[4];
302         ALIGN(float fb_viewportcenter[4]);
303         ALIGN(float fb_viewportscale[4]);
304
305         float color[4];
306         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
307         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
308
309         const float *pointer_vertex3f;
310         const float *pointer_color4f;
311         const unsigned char *pointer_color4ub;
312         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
313         int stride_vertex;
314         int stride_color;
315         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
316         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
317         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
318
319         int firstvertex;
320         int numvertices;
321         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
322         float *screencoord4f;
323         int drawstarty;
324         int drawendy;
325         int drawclipped;
326         
327         int shader_mode;
328         int shader_permutation;
329         int shader_exactspecularmath;
330
331         int texture_max;
332         int texture_end;
333         int texture_firstfree;
334         DPSOFTRAST_Texture *texture;
335
336         int bigendian;
337
338         // error reporting
339         const char *errorstring;
340
341         bool usethreads;
342         int interlace;
343         int numthreads;
344         DPSOFTRAST_State_Thread *threads;
345
346         ATOMIC(volatile int drawcommand);
347
348         DPSOFTRAST_State_Command_Pool commandpool;
349 }
350 DPSOFTRAST_State);
351
352 DPSOFTRAST_State dpsoftrast;
353
// scale applied to a depth value when converting it to the integer depth buffer format (2^30)
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Pack four float channels (0..1 maps to 0..255, rounded) into one int laid
// out as alpha<<24 | red<<16 | green<<8 | blue.  Each argument is
// parenthesized so expressions can be passed, and the alpha shift is done on
// an unsigned value because 255<<24 does not fit in a signed int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | (int)((unsigned int)(int)((a) * 255.0f + 0.5f) << 24))
// Convert a float depth d to the integer depth buffer value DEPTHSCALE*(1-d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
358
359 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
360 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
361
362 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
363 {
364         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
365         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
366         fb_viewportcenter[3] = 0.5f;
367         fb_viewportcenter[0] = 0.0f;
368         fb_viewportscale[1] = 0.5f * viewport[2];
369         fb_viewportscale[2] = -0.5f * viewport[3];
370         fb_viewportscale[3] = 0.5f;
371         fb_viewportscale[0] = 1.0f;
372 }
373
374 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
375 {
376         if (dpsoftrast.interlace)
377         {
378                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
379                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
380                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
381                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
382         }
383         else
384         {
385                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
386                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
387         }
388 }
389
390 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
391 {
392         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
393         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
394         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
395         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
396         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
397 }
398
399 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
400 {
401         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
402         // and viewport projection values
403         int x1, x2;
404         int y1, y2;
405         x1 = thread->scissor[0];
406         x2 = thread->scissor[0] + thread->scissor[2];
407         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
408         y2 = dpsoftrast.fb_height - thread->scissor[1];
409         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
410         if (x1 < 0) x1 = 0;
411         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
412         if (y1 < 0) y1 = 0;
413         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
414         thread->fb_scissor[0] = x1;
415         thread->fb_scissor[1] = y1;
416         thread->fb_scissor[2] = x2 - x1;
417         thread->fb_scissor[3] = y2 - y1;
418
419         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
420         DPSOFTRAST_RecalcClipPlane(thread);
421         DPSOFTRAST_RecalcThread(thread);
422 }
423
424 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
425 {
426         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
427 }
428
429 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
430 {
431         if (thread->blendsubtract)
432         {
433                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
434                 {
435                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
436                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
437                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
438                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
439                 }
440         }
441         else
442         {       
443                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
444                 {
445                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
446                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
447                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
448                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
449                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
450                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
451                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
452                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
453                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
454                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
455                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
456                 }
457         }
458 }
459
460 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
461
462 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
463 {
464         mask &= thread->validate;
465         if (!mask)
466                 return;
467         if (mask & DPSOFTRAST_VALIDATE_FB)
468         {
469                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
470                 DPSOFTRAST_RecalcFB(thread);
471         }
472         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
473         {
474                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
475                 DPSOFTRAST_RecalcDepthFunc(thread);
476         }
477         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
478         {
479                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
480                 DPSOFTRAST_RecalcBlendFunc(thread);
481         }
482 }
483
484 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
485 {
486         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
487                 return &dpsoftrast.texture[index];
488         return NULL;
489 }
490
491 static void DPSOFTRAST_Texture_Grow(void)
492 {
493         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
494         DPSOFTRAST_State_Thread *thread;
495         int i;
496         int j;
497         DPSOFTRAST_Flush();
498         // expand texture array as needed
499         if (dpsoftrast.texture_max < 1024)
500                 dpsoftrast.texture_max = 1024;
501         else
502                 dpsoftrast.texture_max *= 2;
503         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
504         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
505                 if (dpsoftrast.texbound[i])
506                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
507         for (j = 0; j < dpsoftrast.numthreads; j++)
508         {
509                 thread = &dpsoftrast.threads[j];
510                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
511                         if (thread->texbound[i])
512                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
513         }
514 }
515
516 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
517 {
518         int w;
519         int h;
520         int d;
521         int size;
522         int s;
523         int texnum;
524         int mipmaps;
525         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
526         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
527         DPSOFTRAST_Texture *texture;
528         if (width*height*depth < 1)
529         {
530                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
531                 return 0;
532         }
533         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
534         {
535                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
536                 return 0;
537         }
538         switch(texformat)
539         {
540         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
541         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
542         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
543                 break;
544         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
545                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
546                 {
547                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
548                         return 0;
549                 }
550                 if (depth != 1)
551                 {
552                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
553                         return 0;
554                 }
555                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
556                 {
557                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
558                         return 0;
559                 }
560                 break;
561         }
562         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
563         {
564                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
565                 return 0;
566         }
567         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
568         {
569                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
570                 return 0;
571         }
572         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
573         {
574                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
575                 return 0;
576         }
577         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
578         {
579                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
580                 return 0;
581         }
582         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
583         {
584                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
585                 return 0;
586         }
587         // find first empty slot in texture array
588         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
589                 if (!dpsoftrast.texture[texnum].bytes)
590                         break;
591         dpsoftrast.texture_firstfree = texnum + 1;
592         if (dpsoftrast.texture_max <= texnum)
593                 DPSOFTRAST_Texture_Grow();
594         if (dpsoftrast.texture_end <= texnum)
595                 dpsoftrast.texture_end = texnum + 1;
596         texture = &dpsoftrast.texture[texnum];
597         memset(texture, 0, sizeof(*texture));
598         texture->flags = flags;
599         texture->width = width;
600         texture->height = height;
601         texture->depth = depth;
602         texture->sides = sides;
603         texture->binds = 0;
604         w = width;
605         h = height;
606         d = depth;
607         size = 0;
608         mipmaps = 0;
609         w = width;
610         h = height;
611         d = depth;
612         for (;;)
613         {
614                 s = w * h * d * sides * 4;
615                 texture->mipmap[mipmaps][0] = size;
616                 texture->mipmap[mipmaps][1] = s;
617                 texture->mipmap[mipmaps][2] = w;
618                 texture->mipmap[mipmaps][3] = h;
619                 texture->mipmap[mipmaps][4] = d;
620                 size += s;
621                 mipmaps++;
622                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
623                         break;
624                 if (w > 1) w >>= 1;
625                 if (h > 1) h >>= 1;
626                 if (d > 1) d >>= 1;
627         }
628         texture->mipmaps = mipmaps;
629         texture->size = size;
630
631         // allocate the pixels now
632         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
633
634         return texnum;
635 }
636 void DPSOFTRAST_Texture_Free(int index)
637 {
638         DPSOFTRAST_Texture *texture;
639         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
640         if (texture->binds)
641                 DPSOFTRAST_Flush();
642         if (texture->bytes)
643                 MM_FREE(texture->bytes);
644         texture->bytes = NULL;
645         memset(texture, 0, sizeof(*texture));
646         // adjust the free range and used range
647         if (dpsoftrast.texture_firstfree > index)
648                 dpsoftrast.texture_firstfree = index;
649         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
650                 dpsoftrast.texture_end--;
651 }
652 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
653 {
654         int i, x, y, z, w, layer0, layer1, row0, row1;
655         unsigned char *o, *i0, *i1, *i2, *i3;
656         DPSOFTRAST_Texture *texture;
657         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
658         if (texture->mipmaps <= 1)
659                 return;
660         for (i = 1;i < texture->mipmaps;i++)
661         {
662                 for (z = 0;z < texture->mipmap[i][4];z++)
663                 {
664                         layer0 = z*2;
665                         layer1 = z*2+1;
666                         if (layer1 >= texture->mipmap[i-1][4])
667                                 layer1 = texture->mipmap[i-1][4]-1;
668                         for (y = 0;y < texture->mipmap[i][3];y++)
669                         {
670                                 row0 = y*2;
671                                 row1 = y*2+1;
672                                 if (row1 >= texture->mipmap[i-1][3])
673                                         row1 = texture->mipmap[i-1][3]-1;
674                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
675                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
676                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
677                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
678                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
679                                 w = texture->mipmap[i][2];
680                                 if (layer1 > layer0)
681                                 {
682                                         if (texture->mipmap[i-1][2] > 1)
683                                         {
684                                                 // average 3D texture
685                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
686                                                 {
687                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
688                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
689                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
690                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
691                                                 }
692                                         }
693                                         else
694                                         {
695                                                 // average 3D mipmap with parent width == 1
696                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
697                                                 {
698                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
699                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
700                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
701                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
702                                                 }
703                                         }
704                                 }
705                                 else
706                                 {
707                                         if (texture->mipmap[i-1][2] > 1)
708                                         {
709                                                 // average 2D texture (common case)
710                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
711                                                 {
712                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
713                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
714                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
715                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
716                                                 }
717                                         }
718                                         else
719                                         {
720                                                 // 2D texture with parent width == 1
721                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
722                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
723                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
724                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
725                                         }
726                                 }
727                         }
728                 }
729         }
730 }
731 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
732 {
733         DPSOFTRAST_Texture *texture;
734         unsigned char *dst;
735         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
736         if (texture->binds)
737                 DPSOFTRAST_Flush();
738         if (pixels)
739         {
740                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
741                 while (blockheight > 0)
742                 {
743                         memcpy(dst, pixels, blockwidth * 4);
744                         pixels += blockwidth * 4;
745                         dst += texture->mipmap[0][2] * 4;
746                         blockheight--;
747                 }
748         }
749         DPSOFTRAST_Texture_CalculateMipmaps(index);
750 }
751 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
752 {
753         DPSOFTRAST_Texture *texture;
754         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
755         if (texture->binds)
756                 DPSOFTRAST_Flush();
757         if (pixels)
758                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
759         DPSOFTRAST_Texture_CalculateMipmaps(index);
760 }
761 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
762 {
763         DPSOFTRAST_Texture *texture;
764         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
765         return texture->mipmap[mip][2];
766 }
767 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
768 {
769         DPSOFTRAST_Texture *texture;
770         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
771         return texture->mipmap[mip][3];
772 }
773 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
774 {
775         DPSOFTRAST_Texture *texture;
776         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
777         return texture->mipmap[mip][4];
778 }
779 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
780 {
781         DPSOFTRAST_Texture *texture;
782         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
783         if (texture->binds)
784                 DPSOFTRAST_Flush();
785         return texture->bytes + texture->mipmap[mip][0];
786 }
787 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
788 {
789         DPSOFTRAST_Texture *texture;
790         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
791         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
792         {
793                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
794                 return;
795         }
796         if (texture->binds)
797                 DPSOFTRAST_Flush();
798         texture->filter = filter;
799 }
800
801 static void DPSOFTRAST_Draw_FlushThreads(void);
802
803 static void DPSOFTRAST_Draw_SyncCommands(void)
804 {
805         if(dpsoftrast.usethreads) MEMORY_BARRIER;
806         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
807 }
808
809 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
810 {
811         DPSOFTRAST_State_Thread *thread;
812         int i;
813         int freecommand = dpsoftrast.commandpool.freecommand;
814         int usedcommands = dpsoftrast.commandpool.usedcommands;
815         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
816                 return;
817         DPSOFTRAST_Draw_SyncCommands();
818         for(;;)
819         {
820                 int waitindex = -1;
821                 int commandoffset;
822                 usedcommands = 0;
823                 for (i = 0; i < dpsoftrast.numthreads; i++)
824                 {
825                         thread = &dpsoftrast.threads[i]; 
826                         commandoffset = freecommand - thread->commandoffset;
827                         if (commandoffset < 0)
828                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
829                         if (commandoffset > usedcommands)
830                         {
831                                 waitindex = i;
832                                 usedcommands = commandoffset;
833                         }
834                 }
835                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
836                         break;
837                 thread = &dpsoftrast.threads[waitindex];
838                 Thread_LockMutex(thread->drawmutex);
839                 if (thread->commandoffset != dpsoftrast.drawcommand)
840                 {
841                         thread->waiting = true;
842                         if (thread->starving) Thread_CondSignal(thread->drawcond);
843                         Thread_CondWait(thread->waitcond, thread->drawmutex);
844                         thread->waiting = false;
845                 }
846                 Thread_UnlockMutex(thread->drawmutex);
847         }
848         dpsoftrast.commandpool.usedcommands = usedcommands;
849 }
850
// Rounds size up to the next multiple of COMMAND_SIZE. Uses mask arithmetic,
// so COMMAND_SIZE is assumed to be a power of two.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	(((size) + (COMMAND_SIZE-1)) & ~(COMMAND_SIZE-1))
// Allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> from the command pool, sized with
// DPSOFTRAST_ALIGNCOMMAND, and yields a typed pointer to it.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name, \
		DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
855
856 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
857 {
858         DPSOFTRAST_Command *command;
859         int freecommand = dpsoftrast.commandpool.freecommand;
860         int usedcommands = dpsoftrast.commandpool.usedcommands;
861         int extra = sizeof(DPSOFTRAST_Command);
862         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
863                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
864         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
865         {
866                 if (dpsoftrast.usethreads)
867                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
868                 else
869                         DPSOFTRAST_Draw_FlushThreads();
870                 freecommand = dpsoftrast.commandpool.freecommand;
871                 usedcommands = dpsoftrast.commandpool.usedcommands;
872         }
873         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
874         {
875                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
876                 command->opcode = DPSOFTRAST_OPCODE_Reset;
877                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
878                 freecommand = 0;
879         }
880         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
881         command->opcode = opcode;
882         command->commandsize = size;
883         freecommand += size;
884         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
885                 freecommand = 0;
886         dpsoftrast.commandpool.freecommand = freecommand;
887         dpsoftrast.commandpool.usedcommands = usedcommands + size;
888         return command;
889 }
890
891 static void DPSOFTRAST_UndoCommand(int size)
892 {
893         int freecommand = dpsoftrast.commandpool.freecommand;
894         int usedcommands = dpsoftrast.commandpool.usedcommands;
895         freecommand -= size;
896         if (freecommand < 0)
897                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
898         usedcommands -= size;
899         dpsoftrast.commandpool.freecommand = freecommand;
900         dpsoftrast.commandpool.usedcommands = usedcommands;
901 }
902                 
903 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
904 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
905 {
906         thread->viewport[0] = command->x;
907         thread->viewport[1] = command->y;
908         thread->viewport[2] = command->width;
909         thread->viewport[3] = command->height;
910         thread->validate |= DPSOFTRAST_VALIDATE_FB;
911 }
912 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
913 {
914         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
915         command->x = x;
916         command->y = y;
917         command->width = width;
918         command->height = height;
919
920         dpsoftrast.viewport[0] = x;
921         dpsoftrast.viewport[1] = y;
922         dpsoftrast.viewport[2] = width;
923         dpsoftrast.viewport[3] = height;
924         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
925 }
926
927 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
928 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
929 {
930         int i, x1, y1, x2, y2, w, h, x, y;
931         int miny1, maxy1, miny2, maxy2;
932         int bandy;
933         unsigned int *p;
934         unsigned int c;
935         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
936         miny1 = thread->miny1;
937         maxy1 = thread->maxy1;
938         miny2 = thread->miny2;
939         maxy2 = thread->maxy2;
940         x1 = thread->fb_scissor[0];
941         y1 = thread->fb_scissor[1];
942         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
943         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
944         if (y1 < miny1) y1 = miny1;
945         if (y2 > maxy2) y2 = maxy2;
946         w = x2 - x1;
947         h = y2 - y1;
948         if (w < 1 || h < 1)
949                 return;
950         // FIXME: honor fb_colormask?
951         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
952         for (i = 0;i < 4;i++)
953         {
954                 if (!dpsoftrast.fb_colorpixels[i])
955                         continue;
956                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
957                 for (;y < bandy;y++)
958                 {
959                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
960                         for (x = x1;x < x2;x++)
961                                 p[x] = c;
962                 }
963         }
964 }
965 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
966 {
967         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
968         command->r = r;
969         command->g = g;
970         command->b = b;
971         command->a = a;
972 }
973
974 DEFCOMMAND(3, ClearDepth, float depth;)
975 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
976 {
977         int x1, y1, x2, y2, w, h, x, y;
978         int miny1, maxy1, miny2, maxy2;
979         int bandy;
980         unsigned int *p;
981         unsigned int c;
982         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
983         miny1 = thread->miny1;
984         maxy1 = thread->maxy1;
985         miny2 = thread->miny2;
986         maxy2 = thread->maxy2;
987         x1 = thread->fb_scissor[0];
988         y1 = thread->fb_scissor[1];
989         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
990         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
991         if (y1 < miny1) y1 = miny1;
992         if (y2 > maxy2) y2 = maxy2;
993         w = x2 - x1;
994         h = y2 - y1;
995         if (w < 1 || h < 1)
996                 return;
997         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
998         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
999         for (;y < bandy;y++)
1000         {
1001                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1002                 for (x = x1;x < x2;x++)
1003                         p[x] = c;
1004         }
1005 }
1006 void DPSOFTRAST_ClearDepth(float d)
1007 {
1008         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1009         command->depth = d;
1010 }
1011
1012 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1013 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1014 {
1015         thread->colormask[0] = command->r != 0;
1016         thread->colormask[1] = command->g != 0;
1017         thread->colormask[2] = command->b != 0;
1018         thread->colormask[3] = command->a != 0;
1019         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1020 }
1021 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1022 {
1023         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1024         command->r = r;
1025         command->g = g;
1026         command->b = b;
1027         command->a = a;
1028 }
1029
1030 DEFCOMMAND(5, DepthTest, int enable;)
1031 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1032 {
1033         thread->depthtest = command->enable;
1034         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1035 }
1036 void DPSOFTRAST_DepthTest(int enable)
1037 {
1038         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1039         command->enable = enable;
1040 }
1041
1042 DEFCOMMAND(6, ScissorTest, int enable;)
1043 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1044 {
1045         thread->scissortest = command->enable;
1046         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1047 }
1048 void DPSOFTRAST_ScissorTest(int enable)
1049 {
1050         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1051         command->enable = enable;
1052 }
1053
1054 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1055 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1056 {
1057         thread->scissor[0] = command->x;
1058         thread->scissor[1] = command->y;
1059         thread->scissor[2] = command->width;
1060         thread->scissor[3] = command->height;
1061         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1062 }
1063 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1064 {
1065         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1066         command->x = x;
1067         command->y = y;
1068         command->width = width;
1069         command->height = height;
1070 }
1071
1072 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1073 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1074 {
1075         thread->blendfunc[0] = command->sfactor;
1076         thread->blendfunc[1] = command->dfactor;
1077         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1078 }
1079 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1080 {
1081         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1082         command->sfactor = sfactor;
1083         command->dfactor = dfactor;
1084 }
1085
1086 DEFCOMMAND(9, BlendSubtract, int enable;)
1087 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1088 {
1089         thread->blendsubtract = command->enable;
1090         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1091 }
1092 void DPSOFTRAST_BlendSubtract(int enable)
1093 {
1094         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1095         command->enable = enable;
1096 }
1097
1098 DEFCOMMAND(10, DepthMask, int enable;)
1099 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1100 {
1101         thread->depthmask = command->enable;
1102 }
1103 void DPSOFTRAST_DepthMask(int enable)
1104 {
1105         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1106         command->enable = enable;
1107 }
1108
1109 DEFCOMMAND(11, DepthFunc, int func;)
1110 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1111 {
1112         thread->depthfunc = command->func;
1113 }
1114 void DPSOFTRAST_DepthFunc(int func)
1115 {
1116         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1117         command->func = func;
1118 }
1119
1120 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1121 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1122 {
1123         thread->depthrange[0] = command->nearval;
1124         thread->depthrange[1] = command->farval;
1125 }
1126 void DPSOFTRAST_DepthRange(float nearval, float farval)
1127 {
1128         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1129         command->nearval = nearval;
1130         command->farval = farval;
1131 }
1132
1133 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1134 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1135 {
1136         thread->polygonoffset[0] = command->alongnormal;
1137         thread->polygonoffset[1] = command->intoview;
1138 }
1139 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1140 {
1141         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1142         command->alongnormal = alongnormal;
1143         command->intoview = intoview;
1144 }
1145
1146 DEFCOMMAND(14, CullFace, int mode;)
1147 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1148 {
1149         thread->cullface = command->mode;
1150 }
1151 void DPSOFTRAST_CullFace(int mode)
1152 {
1153         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1154         command->mode = mode;
1155 }
1156
1157 DEFCOMMAND(15, AlphaTest, int enable;)
1158 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1159 {
1160         thread->alphatest = command->enable;
1161 }
1162 void DPSOFTRAST_AlphaTest(int enable)
1163 {
1164         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1165         command->enable = enable;
1166 }
1167
1168 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1169 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1170 {
1171         thread->alphafunc = command->func;
1172         thread->alphavalue = command->ref;
1173 }
1174 void DPSOFTRAST_AlphaFunc(int func, float ref)
1175 {
1176         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1177         command->func = func;
1178         command->ref = ref;
1179 }
1180
1181 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1182 {
1183         dpsoftrast.color[0] = r;
1184         dpsoftrast.color[1] = g;
1185         dpsoftrast.color[2] = b;
1186         dpsoftrast.color[3] = a;
1187 }
1188
1189 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1190 {
1191         int outstride = blockwidth * 4;
1192         int instride = dpsoftrast.fb_width * 4;
1193         int bx1 = blockx;
1194         int by1 = blocky;
1195         int bx2 = blockx + blockwidth;
1196         int by2 = blocky + blockheight;
1197         int bw;
1198         int x;
1199         int y;
1200         unsigned char *inpixels;
1201         unsigned char *b;
1202         unsigned char *o;
1203         DPSOFTRAST_Flush();
1204         if (bx1 < 0) bx1 = 0;
1205         if (by1 < 0) by1 = 0;
1206         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1207         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1208         bw = bx2 - bx1;
1209         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1210         if (dpsoftrast.bigendian)
1211         {
1212                 for (y = by1;y < by2;y++)
1213                 {
1214                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1215                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1216                         for (x = bx1;x < bx2;x++)
1217                         {
1218                                 o[0] = b[3];
1219                                 o[1] = b[2];
1220                                 o[2] = b[1];
1221                                 o[3] = b[0];
1222                                 o += 4;
1223                                 b += 4;
1224                         }
1225                 }
1226         }
1227         else
1228         {
1229                 for (y = by1;y < by2;y++)
1230                 {
1231                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1232                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1233                         memcpy(o, b, bw*4);
1234                 }
1235         }
1236
1237 }
1238 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1239 {
1240         int tx1 = tx;
1241         int ty1 = ty;
1242         int tx2 = tx + width;
1243         int ty2 = ty + height;
1244         int sx1 = sx;
1245         int sy1 = sy;
1246         int sx2 = sx + width;
1247         int sy2 = sy + height;
1248         int swidth;
1249         int sheight;
1250         int twidth;
1251         int theight;
1252         int sw;
1253         int sh;
1254         int tw;
1255         int th;
1256         int y;
1257         unsigned int *spixels;
1258         unsigned int *tpixels;
1259         DPSOFTRAST_Texture *texture;
1260         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1261         if (mip < 0 || mip >= texture->mipmaps) return;
1262         DPSOFTRAST_Flush();
1263         spixels = dpsoftrast.fb_colorpixels[0];
1264         swidth = dpsoftrast.fb_width;
1265         sheight = dpsoftrast.fb_height;
1266         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1267         twidth = texture->mipmap[mip][2];
1268         theight = texture->mipmap[mip][3];
1269         if (tx1 < 0) tx1 = 0;
1270         if (ty1 < 0) ty1 = 0;
1271         if (tx2 > twidth) tx2 = twidth;
1272         if (ty2 > theight) ty2 = theight;
1273         if (sx1 < 0) sx1 = 0;
1274         if (sy1 < 0) sy1 = 0;
1275         if (sx2 > swidth) sx2 = swidth;
1276         if (sy2 > sheight) sy2 = sheight;
1277         tw = tx2 - tx1;
1278         th = ty2 - ty1;
1279         sw = sx2 - sx1;
1280         sh = sy2 - sy1;
1281         if (tw > sw) tw = sw;
1282         if (th > sh) th = sh;
1283         if (tw < 1 || th < 1)
1284                 return;
1285         sy1 = sheight - 1 - sy1;
1286         for (y = 0;y < th;y++)
1287                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1288         if (texture->mipmaps > 1)
1289                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1290 }
1291
1292 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1293 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1294 {
1295         if (thread->texbound[command->unitnum])
1296                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1297         thread->texbound[command->unitnum] = command->texture;
1298 }
1299 void DPSOFTRAST_SetTexture(int unitnum, int index)
1300 {
1301         DPSOFTRAST_Command_SetTexture *command;
1302         DPSOFTRAST_Texture *texture;
1303         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1304         {
1305                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1306                 return;
1307         }
1308         texture = DPSOFTRAST_Texture_GetByIndex(index);
1309         if (index && !texture)
1310         {
1311                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1312                 return;
1313         }
1314
1315         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1316         command->unitnum = unitnum;
1317         command->texture = texture;
1318
1319         dpsoftrast.texbound[unitnum] = texture;
1320         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1321 }
1322
1323 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1324 {
1325         dpsoftrast.pointer_vertex3f = vertex3f;
1326         dpsoftrast.stride_vertex = stride;
1327 }
1328 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1329 {
1330         dpsoftrast.pointer_color4f = color4f;
1331         dpsoftrast.pointer_color4ub = NULL;
1332         dpsoftrast.stride_color = stride;
1333 }
1334 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1335 {
1336         dpsoftrast.pointer_color4f = NULL;
1337         dpsoftrast.pointer_color4ub = color4ub;
1338         dpsoftrast.stride_color = stride;
1339 }
1340 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1341 {
1342         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1343         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1344         dpsoftrast.stride_texcoord[unitnum] = stride;
1345 }
1346
1347 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1348 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1349 {
1350         thread->shader_mode = command->mode;
1351         thread->shader_permutation = command->permutation;
1352         thread->shader_exactspecularmath = command->exactspecularmath;
1353 }
1354 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1355 {
1356         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1357         command->mode = mode;
1358         command->permutation = permutation;
1359         command->exactspecularmath = exactspecularmath;
1360
1361         dpsoftrast.shader_mode = mode;
1362         dpsoftrast.shader_permutation = permutation;
1363         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1364 }
1365
1366 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1367 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1368 {
1369         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1370 }
1371 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1372 {
1373         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1374         command->index = index;
1375         command->val[0] = v0;
1376         command->val[1] = v1;
1377         command->val[2] = v2;
1378         command->val[3] = v3;
1379
1380         dpsoftrast.uniform4f[index*4+0] = v0;
1381         dpsoftrast.uniform4f[index*4+1] = v1;
1382         dpsoftrast.uniform4f[index*4+2] = v2;
1383         dpsoftrast.uniform4f[index*4+3] = v3;
1384 }
1385 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1386 {
1387         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1388         command->index = index;
1389         memcpy(command->val, v, sizeof(command->val));
1390
1391         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1392 }
1393
1394 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1395 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1396 {
1397         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1398 }
1399 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1400 {
1401 #ifdef SSE_POSSIBLE
1402         int i, index;
1403         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1404         {
1405                 __m128 m0, m1, m2, m3;
1406                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1407                 command->index = (DPSOFTRAST_UNIFORM)index;
1408                 if (((size_t)v)&(ALIGN_SIZE-1))
1409                 {
1410                         m0 = _mm_loadu_ps(v);
1411                         m1 = _mm_loadu_ps(v+4);
1412                         m2 = _mm_loadu_ps(v+8);
1413                         m3 = _mm_loadu_ps(v+12);
1414                 }
1415                 else
1416                 {
1417                         m0 = _mm_load_ps(v);
1418                         m1 = _mm_load_ps(v+4);
1419                         m2 = _mm_load_ps(v+8);
1420                         m3 = _mm_load_ps(v+12);
1421                 }
1422                 if (transpose)
1423                 {
1424                         __m128 t0, t1, t2, t3;
1425                         t0 = _mm_unpacklo_ps(m0, m1);
1426                         t1 = _mm_unpacklo_ps(m2, m3);
1427                         t2 = _mm_unpackhi_ps(m0, m1);
1428                         t3 = _mm_unpackhi_ps(m2, m3);
1429                         m0 = _mm_movelh_ps(t0, t1);
1430                         m1 = _mm_movehl_ps(t1, t0);
1431                         m2 = _mm_movelh_ps(t2, t3);
1432                         m3 = _mm_movehl_ps(t3, t2);                     
1433                 }
1434                 _mm_store_ps(command->val, m0);
1435                 _mm_store_ps(command->val+4, m1);
1436                 _mm_store_ps(command->val+8, m2);
1437                 _mm_store_ps(command->val+12, m3);
1438                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1439                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1440                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1441                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1442         }
1443 #endif
1444 }
1445
1446 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1447 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1448 {
1449         thread->uniform1i[command->index] = command->val;
1450 }
1451 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1452 {
1453         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1454         command->index = index;
1455         command->val = i0;
1456
1457         dpsoftrast.uniform1i[command->index] = i0;
1458 }
1459
1460 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1461 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1462 {
1463         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1464         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1465 }
1466 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1467 {
1468         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1469         command->clipplane[0] = x;
1470         command->clipplane[1] = y;
1471         command->clipplane[2] = z;
1472         command->clipplane[3] = w;
1473 }
1474
1475 #ifdef SSE_POSSIBLE
1476 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1477 {
1478         float *end = dst + size*4;
1479         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1480         {
1481                 while (dst < end)
1482                 {
1483                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1484                         dst += 4;
1485                         src += stride;
1486                 }
1487         }
1488         else
1489         {
1490                 while (dst < end)
1491                 {
1492                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1493                         dst += 4;
1494                         src += stride;
1495                 }
1496         }
1497 }
1498
1499 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1500 {
1501         float *end = dst + size*4;
1502         if (stride == sizeof(float[3]))
1503         {
1504                 float *end4 = dst + (size&~3)*4;        
1505                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1506                 {
1507                         while (dst < end4)
1508                         {
1509                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1510                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1511                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1512                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1513                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1514                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1515                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1516                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1517                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1518                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1519                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1520                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1521                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1522                                 dst += 16;
1523                                 src += 4*sizeof(float[3]);
1524                         }
1525                 }
1526                 else
1527                 {
1528                         while (dst < end4)
1529                         {
1530                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1531                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1532                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1533                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1534                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1535                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1536                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1537                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1538                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1539                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1540                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1541                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1542                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1543                                 dst += 16;
1544                                 src += 4*sizeof(float[3]);
1545                         }
1546                 }
1547         }
1548         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1549         {
1550                 while (dst < end)
1551                 {
1552                         __m128 v = _mm_loadu_ps((const float *)src);
1553                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1554                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1555                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1556                         _mm_store_ps(dst, v);
1557                         dst += 4;
1558                         src += stride;
1559                 }
1560         }
1561         else
1562         {
1563                 while (dst < end)
1564                 {
1565                         __m128 v = _mm_load_ps((const float *)src);
1566                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1567                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1568                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1569                         _mm_store_ps(dst, v);
1570                         dst += 4;
1571                         src += stride;
1572                 }
1573         }
1574 }
1575
1576 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1577 {
1578         float *end = dst + size*4;
1579         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1580         if (stride == sizeof(float[2]))
1581         {
1582                 float *end2 = dst + (size&~1)*4;
1583                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1584                 {
1585                         while (dst < end2)
1586                         {
1587                                 __m128 v = _mm_loadu_ps((const float *)src);
1588                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1589                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1590                                 dst += 8;
1591                                 src += 2*sizeof(float[2]);
1592                         }
1593                 }
1594                 else
1595                 {
1596                         while (dst < end2)
1597                         {
1598                                 __m128 v = _mm_load_ps((const float *)src);
1599                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1600                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1601                                 dst += 8;
1602                                 src += 2*sizeof(float[2]);
1603                         }
1604                 }
1605         }
1606         while (dst < end)
1607         {
1608                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1609                 dst += 4;
1610                 src += stride;
1611         }
1612 }
1613
1614 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1615 {
1616         float *end = dst + size*4;
1617         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1618         if (stride == sizeof(unsigned char[4]))
1619         {
1620                 float *end4 = dst + (size&~3)*4;
1621                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1622                 {
1623                         while (dst < end4)
1624                         {
1625                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1626                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1627                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1628                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1629                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1630                                 dst += 16;
1631                                 src += 4*sizeof(unsigned char[4]);
1632                         }
1633                 }
1634                 else
1635                 {
1636                         while (dst < end4)
1637                         {
1638                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1639                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1640                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1641                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1642                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1643                                 dst += 16;
1644                                 src += 4*sizeof(unsigned char[4]);
1645                         }
1646                 }
1647         }
1648         while (dst < end)
1649         {
1650                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1651                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1652                 dst += 4;
1653                 src += stride;
1654         }
1655 }
1656
// Writes `size` copies of the four floats at src to dst.  src may be
// unaligned; dst is written with aligned stores and must be 16-byte aligned.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int remaining;
	for (remaining = size; remaining > 0; remaining--, dst += 4)
		_mm_store_ps(dst, value);
}
1667 #endif
1668
// Multiplies `numitems` four-float vectors from in4f by the 4x4 matrix in
// inmatrix16f and writes the results to out4f: each output is
// x*M[0..3] + y*M[4..7] + z*M[8..11] + w*M[12..15].
// out4f is written with aligned stores and must be 16-byte aligned; in4f and
// inmatrix16f may be unaligned.  An exact identity matrix degenerates to a
// plain copy (skipped when out4f == in4f).
// Without SSE_POSSIBLE this function does nothing.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 m0, m1, m2, m3;
	int remaining;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	m0 = _mm_loadu_ps(inmatrix16f);
	m1 = _mm_loadu_ps(inmatrix16f + 4);
	m2 = _mm_loadu_ps(inmatrix16f + 8);
	m3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		for (remaining = numitems; remaining > 0; remaining--, out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_loadu_ps(in4f);
			// accumulated from the w term outwards
			__m128 sum = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0), sum);
			_mm_store_ps(out4f, sum);
		}
	}
	else
	{
		for (remaining = numitems; remaining > 0; remaining--, out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_load_ps(in4f);
			__m128 sum = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0), sum);
			_mm_store_ps(out4f, sum);
		}
	}
#endif
}
1716
// Copies `numitems` four-float vectors from in4f to out4f.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1721
1722 #ifdef SSE_POSSIBLE
// Perspective-divides the vector `in` = (x, y, z, w) and applies the viewport:
// out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w), where c and s are
// the lanes of viewportcenter and viewportscale.  Note that lane 0 of the two
// viewport vectors belongs to the last output component.
// `in` is evaluated once; `out` may be the same variable as `in`.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	/* (x, y, z, w) -> (w, x, y, z) -> (1, x, y, z) */ \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_mul_ps(viewportscale, p); \
	p = _mm_div_ps(p, w); \
	p = _mm_add_ps(viewportcenter, p); \
	/* rotate back so the 1/w term ends up in the last lane */ \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1730
// Same computation as DPSOFTRAST_PROJECTVERTEX (the two bodies are identical):
// out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w) for `in` = (x, y, z, w),
// with c and s the lanes of viewportcenter and viewportscale.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	/* (x, y, z, w) -> (w, x, y, z) -> (1, x, y, z) */ \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_mul_ps(viewportscale, p); \
	p = _mm_div_ps(p, w); \
	p = _mm_add_ps(viewportcenter, p); \
	/* rotate back so the 1/w term ends up in the last lane */ \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1738
// out = x*m0 + y*m1 + z*m2 + w*m3 for `in` = (x, y, z, w); m0..m3 are the four
// vectors of the matrix in memory order.  `in` and `out` are each evaluated
// once, and `out` may be the same variable as `in`.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	/* summed from the w term outwards */ \
	__m128 dpsr_tv_acc = _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3); \
	dpsr_tv_acc = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), dpsr_tv_acc); \
	dpsr_tv_acc = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), dpsr_tv_acc); \
	out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), dpsr_tv_acc); \
}
1747
1748 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1749 {
1750         int clipmask = 0xFF;
1751         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1752         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1753         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1754         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1755         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1756         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1757         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1758         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1759         #define BBFRONT(k, pos) \
1760         { \
1761                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1762                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1763                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1764                 { \
1765                         __m128 proj; \
1766                         clipmask &= ~(1<<k); \
1767                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1768                         minproj = _mm_min_ss(minproj, proj); \
1769                         maxproj = _mm_max_ss(maxproj, proj); \
1770                 } \
1771         }
1772         BBFRONT(0, minpos); 
1773         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1774         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1775         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1776         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1777         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1778         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1779         BBFRONT(7, maxpos);
1780         #define BBCLIP(k) \
1781         { \
1782                 if (clipmask&(1<<k)) \
1783                 { \
1784                         if (!(clipmask&(1<<(k^1)))) \
1785                         { \
1786                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1787                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1788                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1789                                 minproj = _mm_min_ss(minproj, proj); \
1790                                 maxproj = _mm_max_ss(maxproj, proj); \
1791                         } \
1792                         if (!(clipmask&(1<<(k^2)))) \
1793                         { \
1794                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1795                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1796                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1797                                 minproj = _mm_min_ss(minproj, proj); \
1798                                 maxproj = _mm_max_ss(maxproj, proj); \
1799                         } \
1800                         if (!(clipmask&(1<<(k^4)))) \
1801                         { \
1802                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1803                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1804                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1805                                 minproj = _mm_min_ss(minproj, proj); \
1806                                 maxproj = _mm_max_ss(maxproj, proj); \
1807                         } \
1808                 } \
1809         }
1810         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1811         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1812         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1813         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1814         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1815         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1816         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1817         *starty = _mm_cvttss_si32(maxproj);
1818         *endy = _mm_cvttss_si32(minproj)+1;
1819         return clipmask;
1820 }
1821         
1822 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1823 {
1824         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1825         float *end = out4f + numitems*4;
1826         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1827         __m128 minpos, maxpos;
1828         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1829         {
1830                 minpos = maxpos = _mm_loadu_ps(in4f);
1831                 while (out4f < end)
1832                 {
1833                         __m128 v = _mm_loadu_ps(in4f);
1834                         minpos = _mm_min_ps(minpos, v);
1835                         maxpos = _mm_max_ps(maxpos, v);
1836                         _mm_store_ps(out4f, v);
1837                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1838                         _mm_store_ps(screen4f, v);
1839                         in4f += 4;
1840                         out4f += 4;
1841                         screen4f += 4;
1842                 }
1843         }
1844         else
1845         {
1846                 minpos = maxpos = _mm_load_ps(in4f);
1847                 while (out4f < end)
1848                 {
1849                         __m128 v = _mm_load_ps(in4f);
1850                         minpos = _mm_min_ps(minpos, v);
1851                         maxpos = _mm_max_ps(maxpos, v);
1852                         _mm_store_ps(out4f, v);
1853                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1854                         _mm_store_ps(screen4f, v);
1855                         in4f += 4;
1856                         out4f += 4;
1857                         screen4f += 4;
1858                 }
1859         }
1860         if (starty && endy) 
1861         {
1862                 ALIGN(float minposf[4]);
1863                 ALIGN(float maxposf[4]);
1864                 _mm_store_ps(minposf, minpos);
1865                 _mm_store_ps(maxposf, maxpos);
1866                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1867         }
1868         return 0;
1869 }
1870
1871 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1872 {
1873         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1874         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1875         float *end;
1876         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1877                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1878         end = out4f + numitems*4;
1879         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1880         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1881         m0 = _mm_loadu_ps(inmatrix16f);
1882         m1 = _mm_loadu_ps(inmatrix16f + 4);
1883         m2 = _mm_loadu_ps(inmatrix16f + 8);
1884         m3 = _mm_loadu_ps(inmatrix16f + 12);
1885         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1886         {
1887                 minpos = maxpos = _mm_loadu_ps(in4f);
1888                 while (out4f < end)
1889                 {
1890                         __m128 v = _mm_loadu_ps(in4f);
1891                         minpos = _mm_min_ps(minpos, v);
1892                         maxpos = _mm_max_ps(maxpos, v);
1893                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1894                         _mm_store_ps(out4f, v);
1895                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1896                         _mm_store_ps(screen4f, v);
1897                         in4f += 4;
1898                         out4f += 4;
1899                         screen4f += 4;
1900                 }
1901         }
1902         else
1903         {
1904                 minpos = maxpos = _mm_load_ps(in4f);
1905                 while (out4f < end)
1906                 {
1907                         __m128 v = _mm_load_ps(in4f);
1908                         minpos = _mm_min_ps(minpos, v);
1909                         maxpos = _mm_max_ps(maxpos, v);
1910                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1911                         _mm_store_ps(out4f, v);
1912                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1913                         _mm_store_ps(screen4f, v);
1914                         in4f += 4;
1915                         out4f += 4;
1916                         screen4f += 4;
1917                 }
1918         }
1919         if (starty && endy) 
1920         {
1921                 ALIGN(float minposf[4]);
1922                 ALIGN(float maxposf[4]);
1923                 _mm_store_ps(minposf, minpos);
1924                 _mm_store_ps(maxposf, maxpos);
1925                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1926         }
1927         return 0;
1928 }
1929 #endif
1930
// Expands the vertex attribute selected by `inarray` for the current draw
// range (dpsoftrast.firstvertex, dpsoftrast.numvertices) into four-float
// vectors in dpsoftrast.post_array4f[outarray] and returns that buffer.
//  - DPSOFTRAST_ARRAY_POSITION: xyz floats, expanded to (x, y, z, 1)
//  - DPSOFTRAST_ARRAY_COLOR: float colors if set, else byte colors if set,
//    else every vertex gets dpsoftrast.color
//  - anything else is treated as texcoord unit inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
//    the buffer is left untouched when that unit has no pointer or its
//    component count is not 2, 3 or 4
// Without SSE_POSSIBLE this returns NULL.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	const unsigned char *inb;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
		{
			// no color array bound: use the constant color
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
		}
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
				break;
			default:
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1989
1990 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1991 {
1992         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1993         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1994         return data;
1995 }
1996
#if 0
// (compiled out) Projects an attribute array to screen space without a matrix:
// fills dpsoftrast.screencoord4f, drawstarty/drawendy and drawclipped.
// With inarray >= 0 the array is loaded first, otherwise the current contents
// of post_array4f[outarray] are used.  Without SSE_POSSIBLE this returns NULL.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
2009
// Transforms an attribute array in place by inmatrix16f and projects it to
// screen space: fills dpsoftrast.screencoord4f, drawstarty/drawendy and stores
// the clip mask returned by DPSOFTRAST_Vertex_TransformProject in drawclipped.
// With inarray >= 0 the array is loaded first, otherwise the current contents
// of post_array4f[outarray] are used.  Without SSE_POSSIBLE this returns NULL.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2020
2021 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2022 {
2023         int x;
2024         int startx = span->startx;
2025         int endx = span->endx;
2026         float wslope = triangle->w[0];
2027         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2028         float endz = 1.0f / (w + wslope * startx);
2029         if (triangle->w[0] == 0)
2030         {
2031                 // LordHavoc: fast flat polygons (HUD/menu)
2032                 for (x = startx;x < endx;x++)
2033                         zf[x] = endz;
2034                 return;
2035         }
2036         for (x = startx;x < endx;)
2037         {
2038                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2039                 float z = endz, dz;
2040                 if (nextsub >= endx) nextsub = endsub = endx-1;
2041                 endz = 1.0f / (w + wslope * nextsub);
2042                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2043                 for (; x <= endsub; x++, z += dz)
2044                         zf[x] = z;
2045         }
2046 }
2047
2048 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
             // writes the shaded colors of one span into color buffer 0 of the framebuffer
             // (dpsoftrast.fb_colorpixels[0]), combining them with the pixels already there
             // according to thread->fb_blendmode.
             //   in4ub  - 4 bytes per pixel, indexed by span-relative x; byte 3 is used as alpha
             //   span   - supplies startx/endx (span-relative range), x/y (framebuffer position
             //            of span-relative pixel 0) and pixelmask (one byte per pixel)
             // only pixels whose pixelmask byte is nonzero are written.
             // side effects on span->pixelmask: entries are cleared for pixels failing the alpha
             // test and for (most) fully transparent pixels in the alpha blend modes, and two
             // sentinel bytes are stored at pixelmask[endx] and pixelmask[endx+1], so the mask
             // must have room for two entries past the end of the span.
             // the body is compiled only when SSE_POSSIBLE is defined; otherwise this does nothing.
             // triangle is not used here.
2049 {
2050 #ifdef SSE_POSSIBLE
2051         int x;
2052         int startx = span->startx;
2053         int endx = span->endx;
2054         int maskx;
2055         int subx;
2056         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2057         unsigned char * RESTRICT pixelmask = span->pixelmask;
2058         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2059         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
             // nothing to do when color buffer 0 is NULL
2060         if (!pixel)
2061                 return;
             // move both pointers to span-relative pixel 0 of this row, so they can be indexed
             // with the same x as in4ub and pixelmask (pixel is not read again below, all
             // framebuffer access goes through pixeli)
2062         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2063         pixeli += span->y * dpsoftrast.fb_width + span->x;
2064         // handle alphatest now (this affects depth writes too)
2065         if (thread->alphatest)
2066                 for (x = startx;x < endx;x++)
2067                         if (in4ub[x*4+3] < 128)
2068                                 pixelmask[x] = false;
2069         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2070         // helps sprites, text and hud artwork
2071         switch(thread->fb_blendmode)
2072         {
2073         case DPSOFTRAST_BLENDMODE_ALPHA:
2074         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2075         case DPSOFTRAST_BLENDMODE_SUBALPHA:
                     // trim the span to [first pixel with alpha >= 1, end of the last run of such
                     // pixels) and clear the mask of transparent pixels in between.  if no pixel
                     // has alpha >= 1, endx becomes the original startx and nothing is drawn
2076                 maskx = startx;
2077                 for (x = startx;x < endx;x++)
2078                 {
2079                         if (in4ub[x*4+3] >= 1)
2080                         {
                                     // leading transparent pixels are skipped by moving startx
2081                                 startx = x;
2082                                 for (;;)
2083                                 {
                                             // advance past the run of non-transparent pixels;
                                             // maskx records where the latest such run ended
2084                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2085                                         maskx = x;
2086                                         if (x >= endx) break;
                                             // NOTE(review): together with the pre-increment in the loop
                                             // below, this steps over pixel maskx+1 without testing it, and
                                             // pixel maskx itself (known transparent) keeps its mask bit.
                                             // if maskx+1 is the last non-transparent pixel of the span it
                                             // ends up beyond the trimmed endx and is not drawn - confirm
                                             // whether this is intended
2087                                         ++x;
                                             // clear the mask of the transparent pixels that follow
2088                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2089                                         if (x >= endx) break;
2090                                 }
2091                                 break;
2092                         }
2093                 }
2094                 endx = maskx;
2095                 break;
                     // the remaining modes do not trim the span here
2096         case DPSOFTRAST_BLENDMODE_OPAQUE:
2097         case DPSOFTRAST_BLENDMODE_ADD:
2098         case DPSOFTRAST_BLENDMODE_INVMOD:
2099         case DPSOFTRAST_BLENDMODE_MUL:
2100         case DPSOFTRAST_BLENDMODE_MUL2:
2101         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2102         case DPSOFTRAST_BLENDMODE_INVADD:
2103                 break;
2104         }
2105         // put some special values at the end of the mask to ensure the loops end
             // (the 1 at endx stops every search for a set byte, the 0 at endx+1 stops every
             // search for a clear byte; this overwrites whatever the mask held at those indices)
2106         pixelmask[endx] = 1;
2107         pixelmask[endx+1] = 0;
2108         // LordHavoc: use a double loop to identify subspans, this helps the
2109         // optimized copy/blend loops to perform at their best, most triangles
2110         // have only one run of pixels, and do the search using wide reads...
             // each iteration finds one run [x, subx) of consecutive set mask bytes and
             // processes it; the blend code leaves x == subx for the next iteration
2111         x = startx;
2112         while (x < endx)
2113         {
2114                 // if this pixel is masked off, it's probably not alone...
2115                 if (!pixelmask[x])
2116                 {
2117                         x++;
2118 #if 1
                             // fast path, only taken with more than 8 pixels left: step bytewise
                             // up to a multiple of 4, then skip four clear mask bytes per read
2119                         if (x + 8 < endx)
2120                         {
2121                                 // the 4-item search must be aligned or else it stalls badly
2122                                 if ((x & 3) && !pixelmask[x]) 
2123                                 {
                                             // (this test cannot succeed: the enclosing condition
                                             // has just established !pixelmask[x])
2124                                         if(pixelmask[x]) goto endmasked;
2125                                         x++;
2126                                         if (x & 3)
2127                                         {
2128                                                 if(pixelmask[x]) goto endmasked;
2129                                                 x++;
2130                                                 if (x & 3)
2131                                                 {
2132                                                         if(pixelmask[x]) goto endmasked;
2133                                                         x++;
2134                                                 }
2135                                         }
2136                                 }
                                     // may run past endx; the 1 stored at pixelmask[endx] ends it
2137                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2138                                         x += 4;
2139                         }
2140 #endif
                             // bytewise remainder of the search for the next set mask byte
2141                         for (;!pixelmask[x];x++)
2142                                 ;
2143                         // rather than continue the loop, just check the end variable
2144                         if (x >= endx)
2145                                 break;
2146                 }
2147         endmasked:
2148                 // find length of subspan
                     // (pixelmask[x] is set here; subx will become one past the end of the run)
2149                 subx = x + 1;
2150 #if 1
                     // same scheme as above, now searching for the first clear mask byte; the
                     // wide compare only matches mask bytes that are exactly 1
2151                 if (subx + 8 < endx)
2152                 {
2153                         if (subx & 3)
2154                         {
                                     // jumping to endunmasked skips the clip below, which is fine
                                     // because subx is still well below endx on this path
2155                                 if(!pixelmask[subx]) goto endunmasked;
2156                                 subx++;
2157                                 if (subx & 3)
2158                                 {
2159                                         if(!pixelmask[subx]) goto endunmasked;
2160                                         subx++;
2161                                         if (subx & 3)
2162                                         {
2163                                                 if(!pixelmask[subx]) goto endunmasked;
2164                                                 subx++;
2165                                         }
2166                                 }
2167                         }
2168                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2169                                 subx += 4;
2170                 }
2171 #endif
2172                 for (;pixelmask[subx];subx++)
2173                         ;
2174                 // the checks can overshoot, so make sure to clip it...
2175                 if (subx > endx)
2176                         subx = endx;
2177         endunmasked:
2178                 // now that we know the subspan length...  process!
2179                 switch(thread->fb_blendmode)
2180                 {
2181                 case DPSOFTRAST_BLENDMODE_OPAQUE:
                             // plain copy of the run: 16 pixels per step, then 4, then 2, then 1
2182 #if 0
2183                         if (subx - x >= 16)
2184                         {
2185                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2186                                 x = subx;
2187                         }
2188                         else
2189 #elif 1
2190                         while (x + 16 <= subx)
2191                         {
2192                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2193                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2194                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2195                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2196                                 x += 16;
2197                         }
2198 #endif
2199                         {
2200                                 while (x + 4 <= subx)
2201                                 {
2202                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2203                                         x += 4;
2204                                 }
2205                                 if (x + 2 <= subx)
2206                                 {
2207                                         pixeli[x] = ini[x];
2208                                         pixeli[x+1] = ini[x+1];
2209                                         x += 2;
2210                                 }
2211                                 if (x < subx)
2212                                 {
2213                                         pixeli[x] = ini[x];
2214                                         x++;
2215                                 }
2216                         }
2217                         break;
2218                 case DPSOFTRAST_BLENDMODE_ALPHA:
                     // FINISHBLEND(blend2, blend1) processes pixels x .. subx-1 of the run.
                     // blend2 runs on two pixels at a time and blend1 on a final odd pixel; both
                     // see src (incoming color) and dst (framebuffer color) with every channel
                     // byte widened to a 16-bit lane, and must leave the result in dst, which is
                     // packed back to bytes with _mm_packus_epi16 before it is stored.
                     // the macro is defined here once and reused by all following cases
2219                 #define FINISHBLEND(blend2, blend1) \
2220                         for (;x + 1 < subx;x += 2) \
2221                         { \
2222                                 __m128i src, dst; \
2223                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2224                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2225                                 blend2; \
2226                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2227                         } \
2228                         if (x < subx) \
2229                         { \
2230                                 __m128i src, dst; \
2231                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2232                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2233                                 blend1; \
2234                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2235                                 x++; \
2236                         }
                             // dst += (src - dst) * alpha / 256, alpha being lane 3 of each src
                             // pixel broadcast to all of its lanes; both factors are shifted left
                             // by 4 so the high half of the 16x16 product is the product >> 8
2237                         FINISHBLEND({
2238                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2239                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2240                         }, {
2241                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2242                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2243                         });
2244                         break;
2245                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
                             // dst += (src * alpha) >> 8
2246                         FINISHBLEND({
2247                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2248                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2249                         }, {
2250                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2251                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2252                         });
2253                         break;
2254                 case DPSOFTRAST_BLENDMODE_ADD:
                             // dst += src (the pack step of FINISHBLEND limits the sum to a byte)
2255                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2256                         break;
2257                 case DPSOFTRAST_BLENDMODE_INVMOD:
                             // dst -= (dst * src) >> 8
2258                         FINISHBLEND({
2259                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2260                         }, {
2261                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2262                         });
2263                         break;
2264                 case DPSOFTRAST_BLENDMODE_MUL:
                             // dst = (src * dst) >> 8
2265                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2266                         break;
2267                 case DPSOFTRAST_BLENDMODE_MUL2:
                             // dst = (src * dst) >> 7, i.e. twice the MUL result
2268                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2269                         break;
2270                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
                             // dst -= (src * alpha) >> 8
2271                         FINISHBLEND({
2272                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2273                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2274                         }, {
2275                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2276                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2277                         });
2278                         break;
2279                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
                             // dst = src + dst - ((dst * alpha) >> 8)
2280                         FINISHBLEND({
2281                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2282                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2283                         }, {
2284                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2285                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2286                         });
2287                         break;
2288                 case DPSOFTRAST_BLENDMODE_INVADD:
                             // dst += (255 - dst) * src / 256, using the same shift-by-4 and
                             // high-half multiply as the ALPHA case
2289                         FINISHBLEND({
2290                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2291                         }, {
2292                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2293                         });
2294                         break;
2295                 }
2296         }
2297 #endif
2298 }
2299
2300 static void DPSOFTRAST_Texture2D(DPSOFTRAST_Texture *texture, int mip, float x, float y, float c[4])
2301         // warning: this is SLOW, only use if the optimized per-span functions won't do
2302         // FIXME does this function need flipping of the color order?
             // samples mip level `mip` of texture at the coordinates (x, y), where 1.0 spans the
             // full width (texture->mipmap[mip][2]) / height (texture->mipmap[mip][3]) of that
             // level, and stores the four channels in c[0..3] as floats scaled so that a byte
             // value of 255 becomes 1.0.  channels keep the byte order of the texel (no swizzle).
             // with DPSOFTRAST_TEXTURE_FILTER_LINEAR set in texture->filter the four surrounding
             // texels are blended with 12-bit fractional weights, otherwise the one texel that
             // contains the coordinate is returned.
             // with DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE set in texture->flags texel indices are
             // clamped to the level; otherwise they are wrapped by masking with size-1, which is
             // only a true wrap for power-of-two sizes.
2303 {
2304         const unsigned char * RESTRICT pixelbase;
2305         const unsigned char * RESTRICT pixel[4];
2306         int tciwrapmask[2];
2307         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2308         tciwrapmask[1] = texture->mipmap[mip][3]-1;
             // texture->mipmap[mip][0] is the byte offset of this level inside texture->bytes
2309         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2310         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2311         {
2312                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2313                 {
                             // tc is the texel coordinate in 20.12 fixed point, moved by half a
                             // texel (2048) so its integer part is the upper left texel of the 2x2
                             // footprint.  it has to be signed: coordinates before the first texel
                             // center are negative, converting those to an unsigned type is
                             // undefined, and the >= 0 clamps below would never trigger
2314                         int tc[2] = { (int)(x * (texture->mipmap[mip][2]<<12) - 2048), (int)(y * (texture->mipmap[mip][3]<<12) - 2048) };
2315                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2316                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
                             // bilinear weights, their sum is always 0x1000 * 0x1000 = 1<<24
2317                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
                             // floor(tc / 4096): tc minus its fraction is an exact multiple of
                             // 4096, so the division is exact and no negative value is shifted
2318                         int tci[2] = { (tc[0] - (int)frac[0]) / 4096, (tc[1] - (int)frac[1]) / 4096 };
2319                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2320                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[mip][2]-1 ? tci[0] : texture->mipmap[mip][2]-1) : 0;
2321                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[mip][3]-1 ? tci[1] : texture->mipmap[mip][3]-1) : 0;
2322                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[mip][2]-1 ? tci1[0] : texture->mipmap[mip][2]-1) : 0;
2323                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[mip][3]-1 ? tci1[1] : texture->mipmap[mip][3]-1) : 0;
2324                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
2325                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci1[0]);
2326                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci[0]);
2327                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci1[0]);
                             // the weighted sum is at most 255 * (1<<24) = 0xFF000000, so that is
                             // the divisor which maps it to 0..1
2328                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2329                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2330                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2331                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2332                 }
2333                 else
2334                 {
                             // same fixed point setup as the clamped case (signed so that negative
                             // coordinates are well defined), indices are wrapped instead of clamped
2335                         int tc[2] = { (int)(x * (texture->mipmap[mip][2]<<12) - 2048), (int)(y * (texture->mipmap[mip][3]<<12) - 2048) };
2336                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2337                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2338                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2339                         int tci[2] = { (tc[0] - (int)frac[0]) / 4096, (tc[1] - (int)frac[1]) / 4096 };
2340                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2341                         tci[0] &= tciwrapmask[0];
2342                         tci[1] &= tciwrapmask[1];
2343                         tci1[0] &= tciwrapmask[0];
2344                         tci1[1] &= tciwrapmask[1];
2345                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
2346                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci1[0]);
2347                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci[0]);
2348                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci1[0]);
                             // divisor is 255 * (1<<24), see the clamped case
2349                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2350                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2351                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2352                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2353                 }
2354         }
2355         else
2356         {
                     // nearest texel: the scaled coordinate is truncated to an integer index
2357                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2358                 {
2359                         int tci[2] = { x * texture->mipmap[mip][2], y * texture->mipmap[mip][3] };
2360                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[mip][2]-1 ? tci[0] : texture->mipmap[mip][2]-1) : 0;
2361                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[mip][3]-1 ? tci[1] : texture->mipmap[mip][3]-1) : 0;
2362                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
2363                         c[0] = pixel[0][0] * (1.0f / 255.0f);
2364                         c[1] = pixel[0][1] * (1.0f / 255.0f);
2365                         c[2] = pixel[0][2] * (1.0f / 255.0f);
2366                         c[3] = pixel[0][3] * (1.0f / 255.0f);
2367                 }
2368                 else
2369                 {
2370                         int tci[2] = { x * texture->mipmap[mip][2], y * texture->mipmap[mip][3] };
2371                         tci[0] &= tciwrapmask[0];
2372                         tci[1] &= tciwrapmask[1];
2373                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
2374                         c[0] = pixel[0][0] * (1.0f / 255.0f);
2375                         c[1] = pixel[0][1] * (1.0f / 255.0f);
2376                         c[2] = pixel[0][2] * (1.0f / 255.0f);
2377                         c[3] = pixel[0][3] * (1.0f / 255.0f);
2378                 }
2379         }
2380 }
2381
2382 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2383         // warning: this is SLOW, only use if the optimized per-span functions won't do
             // samples mip level `mip` of texture at the coordinates (x, y), where 1.0 spans the
             // full width (texture->mipmap[mip][2]) / height (texture->mipmap[mip][3]) of that
             // level, and stores the four channel bytes in c[0..3] in the byte order of the texel.
             // with DPSOFTRAST_TEXTURE_FILTER_LINEAR set in texture->filter the four surrounding
             // texels are blended with 12-bit fractional weights, otherwise the one texel that
             // contains the coordinate is copied.
             // with DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE set in texture->flags texel indices are
             // clamped to the level; otherwise they are wrapped by masking with size-1, which is
             // only a true wrap for power-of-two sizes.
2384 {
2385         const unsigned char * RESTRICT pixelbase;
2386         const unsigned char * RESTRICT pixel[4];
2387         int tciwrapmask[2];
2388         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2389         tciwrapmask[1] = texture->mipmap[mip][3]-1;
             // texture->mipmap[mip][0] is the byte offset of this level inside texture->bytes
2390         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2391         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2392         {
2393                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2394                 {
                             // tc is the texel coordinate in 20.12 fixed point, moved by half a
                             // texel (2048) so its integer part is the upper left texel of the 2x2
                             // footprint.  it has to be signed: coordinates before the first texel
                             // center are negative, converting those to an unsigned type is
                             // undefined, and the >= 0 clamps below would never trigger
2395                         int tc[2] = { (int)(x * (texture->mipmap[mip][2]<<12) - 2048), (int)(y * (texture->mipmap[mip][3]<<12) - 2048) };
2396                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2397                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
                             // bilinear weights, their sum is always 0x1000 * 0x1000 = 1<<24
2398                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
                             // floor(tc / 4096): tc minus its fraction is an exact multiple of
                             // 4096, so the division is exact and no negative value is shifted
2399                         int tci[2] = { (tc[0] - (int)frac[0]) / 4096, (tc[1] - (int)frac[1]) / 4096 };
2400                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2401                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[mip][2]-1 ? tci[0] : texture->mipmap[mip][2]-1) : 0;
2402                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[mip][3]-1 ? tci[1] : texture->mipmap[mip][3]-1) : 0;
2403                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[mip][2]-1 ? tci1[0] : texture->mipmap[mip][2]-1) : 0;
2404                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[mip][3]-1 ? tci1[1] : texture->mipmap[mip][3]-1) : 0;
2405                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
2406                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci1[0]);
2407                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci[0]);
2408                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci1[0]);
                             // the weights total 1<<24, so >>24 brings the sum back to a byte
2409                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2410                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2411                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2412                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2413                 }
2414                 else
2415                 {
                             // same fixed point setup as the clamped case (signed so that negative
                             // coordinates are well defined), indices are wrapped instead of clamped
2416                         int tc[2] = { (int)(x * (texture->mipmap[mip][2]<<12) - 2048), (int)(y * (texture->mipmap[mip][3]<<12) - 2048) };
2417                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2418                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2419                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2420                         int tci[2] = { (tc[0] - (int)frac[0]) / 4096, (tc[1] - (int)frac[1]) / 4096 };
2421                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2422                         tci[0] &= tciwrapmask[0];
2423                         tci[1] &= tciwrapmask[1];
2424                         tci1[0] &= tciwrapmask[0];
2425                         tci1[1] &= tciwrapmask[1];
2426                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
2427                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci1[0]);
2428                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci[0]);
2429                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[mip][2]+tci1[0]);
2430                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2431                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2432                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2433                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2434                 }
2435         }
2436         else
2437         {
                     // nearest texel: the scaled coordinate is truncated to an integer index
2438                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2439                 {
2440                         int tci[2] = { x * texture->mipmap[mip][2], y * texture->mipmap[mip][3] };
2441                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[mip][2]-1 ? tci[0] : texture->mipmap[mip][2]-1) : 0;
2442                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[mip][3]-1 ? tci[1] : texture->mipmap[mip][3]-1) : 0;
2443                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
2444                         c[0] = pixel[0][0];
2445                         c[1] = pixel[0][1];
2446                         c[2] = pixel[0][2];
2447                         c[3] = pixel[0][3];
2448                 }
2449                 else
2450                 {
2451                         int tci[2] = { x * texture->mipmap[mip][2], y * texture->mipmap[mip][3] };
2452                         tci[0] &= tciwrapmask[0];
2453                         tci[1] &= tciwrapmask[1];
2454                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[mip][2]+tci[0]);
2455                         c[0] = pixel[0][0];
2456                         c[1] = pixel[0][1];
2457                         c[2] = pixel[0][2];
2458                         c[3] = pixel[0][3];
2459                 }
2460         }
2461 }
2462
2463 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2464 {
2465         int x;
2466         int startx = span->startx;
2467         int endx = span->endx;
2468         int flags;
2469         float c[4];
2470         float data[4];
2471         float slope[4];
2472         float tc[2], endtc[2];
2473         float tcscale[2];
2474         unsigned int tci[2];
2475         unsigned int tci1[2];
2476         unsigned int tcimin[2];
2477         unsigned int tcimax[2];
2478         int tciwrapmask[2];
2479         int tciwidth;
2480         int filter;
2481         int mip;
2482         const unsigned char * RESTRICT pixelbase;
2483         const unsigned char * RESTRICT pixel[4];
2484         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2485         // if no texture is bound, just fill it with white
2486         if (!texture)
2487         {
2488                 for (x = startx;x < endx;x++)
2489                 {
2490                         out4f[x*4+0] = 1.0f;
2491                         out4f[x*4+1] = 1.0f;
2492                         out4f[x*4+2] = 1.0f;
2493                         out4f[x*4+3] = 1.0f;
2494                 }
2495                 return;
2496         }
2497         mip = triangle->mip[texunitindex];
2498         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2499         // if this mipmap of the texture is 1 pixel, just fill it with that color
2500         if (texture->mipmap[mip][1] == 4)
2501         {
2502                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2503                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2504                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2505                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2506                 for (x = startx;x < endx;x++)
2507                 {
2508                         out4f[x*4+0] = c[0];
2509                         out4f[x*4+1] = c[1];
2510                         out4f[x*4+2] = c[2];
2511                         out4f[x*4+3] = c[3];
2512                 }
2513                 return;
2514         }
2515         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2516         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2517         flags = texture->flags;
2518         tcscale[0] = texture->mipmap[mip][2];
2519         tcscale[1] = texture->mipmap[mip][3];
2520         tciwidth = texture->mipmap[mip][2];
2521         tcimin[0] = 0;
2522         tcimin[1] = 0;
2523         tcimax[0] = texture->mipmap[mip][2]-1;
2524         tcimax[1] = texture->mipmap[mip][3]-1;
2525         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2526         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2527         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2528         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2529         if (filter)
2530         {
2531                 endtc[0] -= 0.5f;
2532                 endtc[1] -= 0.5f;
2533         }
2534         for (x = startx;x < endx;)
2535         {
2536                 unsigned int subtc[2];
2537                 unsigned int substep[2];
2538                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2539                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2540                 if (nextsub >= endx)
2541                 {
2542                         nextsub = endsub = endx-1;      
2543                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2544                 }
2545                 tc[0] = endtc[0];
2546                 tc[1] = endtc[1];
2547                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2548                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2549                 if (filter)
2550                 {
2551                         endtc[0] -= 0.5f;
2552                         endtc[1] -= 0.5f;
2553                 }
2554                 substep[0] = (endtc[0] - tc[0]) * subscale;
2555                 substep[1] = (endtc[1] - tc[1]) * subscale;
2556                 subtc[0] = tc[0] * (1<<12);
2557                 subtc[1] = tc[1] * (1<<12);
2558                 if (filter)
2559                 {
2560                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2561                         {
2562                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2563                                 {
2564                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2565                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2566                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2567                                         tci[0] = subtc[0]>>12;
2568                                         tci[1] = subtc[1]>>12;
2569                                         tci1[0] = tci[0] + 1;
2570                                         tci1[1] = tci[1] + 1;
2571                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2572                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2573                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2574                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2575                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2576                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2577                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2578                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2579                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2580                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2581                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2582                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2583                                         out4f[x*4+0] = c[0];
2584                                         out4f[x*4+1] = c[1];
2585                                         out4f[x*4+2] = c[2];
2586                                         out4f[x*4+3] = c[3];
2587                                 }
2588                         }
2589                         else
2590                         {
2591                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2592                                 {
2593                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2594                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2595                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2596                                         tci[0] = subtc[0]>>12;
2597                                         tci[1] = subtc[1]>>12;
2598                                         tci1[0] = tci[0] + 1;
2599                                         tci1[1] = tci[1] + 1;
2600                                         tci[0] &= tciwrapmask[0];
2601                                         tci[1] &= tciwrapmask[1];
2602                                         tci1[0] &= tciwrapmask[0];
2603                                         tci1[1] &= tciwrapmask[1];
2604                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2605                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2606                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2607                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2608                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2609                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2610                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2611                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2612                                         out4f[x*4+0] = c[0];
2613                                         out4f[x*4+1] = c[1];
2614                                         out4f[x*4+2] = c[2];
2615                                         out4f[x*4+3] = c[3];
2616                                 }
2617                         }
2618                 }
2619                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2620                 {
2621                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2622                         {
2623                                 tci[0] = subtc[0]>>12;
2624                                 tci[1] = subtc[1]>>12;
2625                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2626                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2627                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2628                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2629                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2630                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2631                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2632                                 out4f[x*4+0] = c[0];
2633                                 out4f[x*4+1] = c[1];
2634                                 out4f[x*4+2] = c[2];
2635                                 out4f[x*4+3] = c[3];
2636                         }
2637                 }
2638                 else
2639                 {
2640                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2641                         {
2642                                 tci[0] = subtc[0]>>12;
2643                                 tci[1] = subtc[1]>>12;
2644                                 tci[0] &= tciwrapmask[0];
2645                                 tci[1] &= tciwrapmask[1];
2646                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2647                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2648                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2649                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2650                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2651                                 out4f[x*4+0] = c[0];
2652                                 out4f[x*4+1] = c[1];
2653                                 out4f[x*4+2] = c[2];
2654                                 out4f[x*4+3] = c[3];
2655                         }
2656                 }
2657         }
2658 }
2659
2660 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2661 {
2662 #ifdef SSE_POSSIBLE
2663         int x;
2664         int startx = span->startx;
2665         int endx = span->endx;
2666         int flags;
2667         __m128 data, slope, tcscale;
2668         __m128i tcsize, tcmask, tcoffset, tcmax;
2669         __m128 tc, endtc;
2670         __m128i subtc, substep, endsubtc;
2671         int filter;
2672         int mip;
2673         int affine; // LordHavoc: optimized affine texturing case
2674         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2675         const unsigned char * RESTRICT pixelbase;
2676         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2677         // if no texture is bound, just fill it with white
2678         if (!texture)
2679         {
2680                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2681                 return;
2682         }
2683         mip = triangle->mip[texunitindex];
2684         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2685         // if this mipmap of the texture is 1 pixel, just fill it with that color
2686         if (texture->mipmap[mip][1] == 4)
2687         {
2688                 unsigned int k = *((const unsigned int *)pixelbase);
2689                 for (x = startx;x < endx;x++)
2690                         outi[x] = k;
2691                 return;
2692         }
2693         affine = zf[startx] == zf[endx-1];
2694         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2695         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2696         flags = texture->flags;
2697         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2698         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2699         tcscale = _mm_cvtepi32_ps(tcsize);
2700         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2701         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2702         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2703         if (filter)
2704                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2705         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2706         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2707         tcmax = _mm_packs_epi32(tcmask, tcmask);
2708         for (x = startx;x < endx;)
2709         {
2710                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2711                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2712                 if (nextsub >= endx || affine)
2713                 {
2714                         nextsub = endsub = endx-1;
2715                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2716                 }       
2717                 tc = endtc;
2718                 subtc = endsubtc;
2719                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2720                 if (filter)
2721                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2722                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2723                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2724                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2725                 substep = _mm_slli_epi32(substep, 1);
2726                 if (filter)
2727                 {
2728                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2729                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2730                         {
2731                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2732                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2733                                 {
2734                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2735                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2736                                         tci = _mm_madd_epi16(tci, tcoffset);
2737                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2738                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2739                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2740                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2741                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2742                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2743                                         fracm = _mm_srli_epi16(subtc, 1);
2744                                         pix1 = _mm_add_epi16(pix1,
2745                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2746                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2747                                         pix3 = _mm_add_epi16(pix3,
2748                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2749                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2750                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2751                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2752                                         pix2 = _mm_add_epi16(pix2,
2753                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2754                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2755                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2756                                 }
2757                                 if (x <= endsub)
2758                                 {
2759                                         const unsigned char * RESTRICT ptr1;
2760                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2761                                         tci = _mm_madd_epi16(tci, tcoffset);
2762                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2763                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2764                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2765                                         fracm = _mm_srli_epi16(subtc, 1);
2766                                         pix1 = _mm_add_epi16(pix1,
2767                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2768                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2769                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2770                                         pix1 = _mm_add_epi16(pix1,
2771                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2772                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2773                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2774                                         x++;
2775                                 }
2776                         }
2777                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2778                         {
2779                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2780                                 {
2781                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2782                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2783                                         tci = _mm_madd_epi16(tci, tcoffset);
2784                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2785                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2786                                                                                         _mm_setzero_si128());
2787                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2788                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2789                                                                                         _mm_setzero_si128());
2790                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2791                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2792                                         tci = _mm_madd_epi16(tci, tcoffset);
2793                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2794                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2795                                                                                         _mm_setzero_si128());
2796                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2797                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2798                                                                                         _mm_setzero_si128());
2799                                         fracm = _mm_srli_epi16(subtc, 1);
2800                                         pix1 = _mm_add_epi16(pix1,
2801                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2802                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2803                                         pix3 = _mm_add_epi16(pix3,
2804                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2805                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2806                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2807                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2808                                         pix2 = _mm_add_epi16(pix2,
2809                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2810                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2811                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2812                                 }
2813                                 if (x <= endsub)
2814                                 {
2815                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2816                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2817                                         tci = _mm_madd_epi16(tci, tcoffset);
2818                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2819                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2820                                                                                         _mm_setzero_si128());
2821                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2822                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2823                                                                                         _mm_setzero_si128());
2824                                         fracm = _mm_srli_epi16(subtc, 1);
2825                                         pix1 = _mm_add_epi16(pix1,
2826                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2827                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2828                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2829                                         pix1 = _mm_add_epi16(pix1,
2830                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2831                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2832                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2833                                         x++;
2834                                 }
2835                         }
2836                         else
2837                         {
2838                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2839                                 {
2840                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2841                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2842                                         tci = _mm_madd_epi16(tci, tcoffset);
2843                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2844                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2845                                                                                         _mm_setzero_si128());
2846                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2847                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2848                                                                                         _mm_setzero_si128());
2849                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2850                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2851                                         tci = _mm_madd_epi16(tci, tcoffset);
2852                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2853                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2854                                                                                         _mm_setzero_si128());
2855                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2856                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2857                                                                                         _mm_setzero_si128());
2858                                         fracm = _mm_srli_epi16(subtc, 1);
2859                                         pix1 = _mm_add_epi16(pix1,
2860                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2861                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2862                                         pix3 = _mm_add_epi16(pix3,
2863                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2864                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2865                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2866                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2867                                         pix2 = _mm_add_epi16(pix2,
2868                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2869                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2870                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2871                                 }
2872                                 if (x <= endsub)
2873                                 {
2874                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2875                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2876                                         tci = _mm_madd_epi16(tci, tcoffset);
2877                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2878                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2879                                                                                         _mm_setzero_si128());
2880                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2881                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2882                                                                                         _mm_setzero_si128());
2883                                         fracm = _mm_srli_epi16(subtc, 1);
2884                                         pix1 = _mm_add_epi16(pix1,
2885                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2886                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2887                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2888                                         pix1 = _mm_add_epi16(pix1,
2889                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2890                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2891                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2892                                         x++;
2893                                 }
2894                         }
2895                 }
2896                 else
2897                 {
2898                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2899                         {
2900                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2901                                 {
2902                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2903                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2904                                         tci = _mm_madd_epi16(tci, tcoffset);
2905                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2906                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2907                                 }
2908                                 if (x <= endsub)
2909                                 {
2910                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2911                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2912                                         tci = _mm_madd_epi16(tci, tcoffset);
2913                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2914                                         x++;
2915                                 }
2916                         }
2917                         else
2918                         {
2919                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2920                                 {
2921                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2922                                         tci = _mm_and_si128(tci, tcmax); 
2923                                         tci = _mm_madd_epi16(tci, tcoffset);
2924                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2925                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2926                                 }
2927                                 if (x <= endsub)
2928                                 {
2929                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2930                                         tci = _mm_and_si128(tci, tcmax); 
2931                                         tci = _mm_madd_epi16(tci, tcoffset);
2932                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2933                                         x++;
2934                                 }
2935                         }
2936                 }
2937         }
2938 #endif
2939 }
2940
2941 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2942 {
2943         // TODO: IMPLEMENT
2944         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2945 }
2946
// Shadowmap lookup stub: sampling is not implemented yet, so the result is the
// constant 1.0f no matter what vector is passed (the pointer is never read).
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2952
2953 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2954 {
2955         int x;
2956         int startx = span->startx;
2957         int endx = span->endx;
2958         float c[4];
2959         float data[4];
2960         float slope[4];
2961         float z;
2962         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2963         for (x = startx;x < endx;x++)
2964         {
2965                 z = zf[x];
2966                 c[0] = (data[0] + slope[0]*x) * z;
2967                 c[1] = (data[1] + slope[1]*x) * z;
2968                 c[2] = (data[2] + slope[2]*x) * z;
2969                 c[3] = (data[3] + slope[3]*x) * z;
2970                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2971                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2972                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2973                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2974         }
2975 }
2976
2977 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2978 {
2979         int x;
2980         int startx = span->startx;
2981         int endx = span->endx;
2982         float c[4];
2983         float data[4];
2984         float slope[4];
2985         float z;
2986         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2987         for (x = startx;x < endx;x++)
2988         {
2989                 z = zf[x];
2990                 c[0] = (data[0] + slope[0]*x) * z;
2991                 c[1] = (data[1] + slope[1]*x) * z;
2992                 c[2] = (data[2] + slope[2]*x) * z;
2993                 c[3] = (data[3] + slope[3]*x) * z;
2994                 out4f[x*4+0] = c[0];
2995                 out4f[x*4+1] = c[1];
2996                 out4f[x*4+2] = c[2];
2997                 out4f[x*4+3] = c[3];
2998         }
2999 }
3000
3001 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
3002 {
3003         int x, startx = span->startx, endx = span->endx;
3004         float c[4], localcolor[4];
3005         localcolor[0] = subcolor[0];
3006         localcolor[1] = subcolor[1];
3007         localcolor[2] = subcolor[2];
3008         localcolor[3] = subcolor[3];
3009         for (x = startx;x < endx;x++)
3010         {
3011                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
3012                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
3013                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
3014                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
3015                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
3016                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
3017                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
3018                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
3019         }
3020 }
3021
3022 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
3023 {
3024         int x, startx = span->startx, endx = span->endx;
3025         for (x = startx;x < endx;x++)
3026         {
3027                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
3028                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
3029                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
3030                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
3031         }
3032 }
3033
3034 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
3035 {
3036         int x, startx = span->startx, endx = span->endx;
3037         for (x = startx;x < endx;x++)
3038         {
3039                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
3040                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
3041                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
3042                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
3043         }
3044 }
3045
3046 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
3047 {
3048         int x, startx = span->startx, endx = span->endx;
3049         float a, b;
3050         for (x = startx;x < endx;x++)
3051         {
3052                 a = 1.0f - inb4f[x*4+3];
3053                 b = inb4f[x*4+3];
3054                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
3055                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
3056                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
3057                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
3058         }
3059 }
3060
3061 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
3062 {
3063         int x, startx = span->startx, endx = span->endx;
3064         float localcolor[4], ilerp, lerp;
3065         localcolor[0] = color[0];
3066         localcolor[1] = color[1];
3067         localcolor[2] = color[2];
3068         localcolor[3] = color[3];
3069         ilerp = 1.0f - localcolor[3];
3070         lerp = localcolor[3];
3071         for (x = startx;x < endx;x++)
3072         {
3073                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
3074                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
3075                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
3076                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
3077         }
3078 }
3079
3080
3081
3082 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
3083 {
3084 #ifdef SSE_POSSIBLE
3085         int x;
3086         int startx = span->startx;
3087         int endx = span->endx;
3088         __m128 data, slope;
3089         __m128 mod, endmod;
3090         __m128i submod, substep, endsubmod;
3091         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3092         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3093         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3094         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3095         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3096         for (x = startx; x < endx;)
3097         {
3098                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3099                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3100                 if (nextsub >= endx)
3101                 {
3102                         nextsub = endsub = endx-1;
3103                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3104                 }
3105                 mod = endmod;
3106                 submod = endsubmod;
3107                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3108                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3109                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3110                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3111                 substep = _mm_packs_epi32(substep, substep);
3112                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3113                 {
3114                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3115                         pix = _mm_mulhi_epu16(pix, submod);
3116                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3117                 }
3118                 if (x <= endsub)
3119                 {
3120                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3121                         pix = _mm_mulhi_epu16(pix, submod);
3122                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3123                         x++;
3124                 }
3125         }
3126 #endif
3127 }
3128
3129 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3130 {
3131 #ifdef SSE_POSSIBLE
3132         int x;
3133         int startx = span->startx;
3134         int endx = span->endx;
3135         __m128 data, slope;
3136         __m128 mod, endmod;
3137         __m128i submod, substep, endsubmod;
3138         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3139         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3140         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3141         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3142         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3143         for (x = startx; x < endx;)
3144         {
3145                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3146                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3147                 if (nextsub >= endx)
3148                 {
3149                         nextsub = endsub = endx-1;
3150                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3151                 }
3152                 mod = endmod;
3153                 submod = endsubmod;
3154                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3155                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3156                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3157                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3158                 substep = _mm_packs_epi32(substep, substep);
3159                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3160                 {
3161                         __m128i pix = _mm_srai_epi16(submod, 4);
3162                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3163                 }
3164                 if (x <= endsub)
3165                 {
3166                         __m128i pix = _mm_srai_epi16(submod, 4);
3167                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3168                         x++;
3169                 }
3170         }
3171 #endif
3172 }
3173
3174 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3175 {
3176 #ifdef SSE_POSSIBLE
3177         int x, startx = span->startx, endx = span->endx;
3178         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3179         localcolor = _mm_packs_epi32(localcolor, localcolor);
3180         for (x = startx;x+2 <= endx;x+=2)
3181         {
3182                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3183                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3184                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3185                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3186         }
3187         if (x < endx)
3188         {
3189                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3190                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3191                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3192                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3193         }
3194 #endif
3195 }
3196
3197 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3198 {
3199 #ifdef SSE_POSSIBLE
3200         int x, startx = span->startx, endx = span->endx;
3201         for (x = startx;x+2 <= endx;x+=2)
3202         {
3203                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3204                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3205                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3206                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3207         }
3208         if (x < endx)
3209         {
3210                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3211                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3212                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3213                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3214         }
3215 #endif
3216 }
3217
3218 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3219 {
3220 #ifdef SSE_POSSIBLE
3221         int x, startx = span->startx, endx = span->endx;
3222         for (x = startx;x+2 <= endx;x+=2)
3223         {
3224                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3225                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3226                 pix1 = _mm_add_epi16(pix1, pix2);
3227                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3228         }
3229         if (x < endx)
3230         {
3231                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3232                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3233                 pix1 = _mm_add_epi16(pix1, pix2);
3234                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3235         }
3236 #endif
3237 }
3238
3239 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3240 {
3241 #ifdef SSE_POSSIBLE
3242         int x, startx = span->startx, endx = span->endx;
3243         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3244         tint = _mm_packs_epi32(tint, tint);
3245         for (x = startx;x+2 <= endx;x+=2)
3246         {
3247                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3248                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3249                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3250                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3251         }
3252         if (x < endx)
3253         {
3254                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3255                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3256                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3257                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3258         }
3259 #endif
3260 }
3261
3262 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3263 {
3264 #ifdef SSE_POSSIBLE
3265         int x, startx = span->startx, endx = span->endx;
3266         for (x = startx;x+2 <= endx;x+=2)
3267         {
3268                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3269                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3270                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3271                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3272                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3273         }
3274         if (x < endx)
3275         {
3276                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3277                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3278                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3279                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3280                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3281         }
3282 #endif
3283 }
3284
3285 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3286 {
3287 #ifdef SSE_POSSIBLE
3288         int x, startx = span->startx, endx = span->endx;
3289         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3290         localcolor = _mm_packs_epi32(localcolor, localcolor);
3291         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3292         for (x = startx;x+2 <= endx;x+=2)
3293         {
3294                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3295                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3296                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3297         }
3298         if (x < endx)
3299         {
3300                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3301                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3302                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3303         }
3304 #endif
3305 }
3306
3307
3308
3309 void DPSOFTRAST_VertexShader_Generic(void)
3310 {
3311         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3312         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3313         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3314         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3315                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3316 }
3317
3318 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3319 {
3320         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3321         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3322         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3323         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3324         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3325         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3326         {
3327                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3328                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3329                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3330                 {
3331                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3332                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3333                         {
3334                                 // multiply
3335                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3336                         }
3337                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3338                         {
3339                                 // add
3340                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3341                         }
3342                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3343                         {
3344                                 // alphablend
3345                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3346                         }
3347                 }
3348         }
3349         else
3350                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3351         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3352 }
3353
3354
3355
3356 void DPSOFTRAST_VertexShader_PostProcess(void)
3357 {
3358         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3359         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3360         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3361 }
3362
3363 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3364 {
3365         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3366         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3367         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3368         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3369         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3370         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3371         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3372         {
3373                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3374                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3375         }
3376         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3377         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3378         {
3379                 // TODO: implement saturation
3380         }
3381         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3382         {
3383                 // TODO: implement gammaramps
3384         }
3385         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3386 }
3387
3388
3389
3390 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3391 {
3392         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3393 }
3394
3395 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3396 {
3397         // this is never called (because colormask is off when this shader is used)
3398         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3399         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3400         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3401         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3402         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3403 }
3404
3405
3406
3407 void DPSOFTRAST_VertexShader_FlatColor(void)
3408 {
3409         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3410         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3411 }
3412
3413 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3414 {
3415 #ifdef SSE_POSSIBLE
3416         unsigned char * RESTRICT pixelmask = span->pixelmask;
3417         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3418         int x, startx = span->startx, endx = span->endx;
3419         __m128i Color_Ambientm;
3420         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3421         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3422         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3423         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3424         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3425         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3426                 pixel = buffer_FragColorbgra8;
3427         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3428         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3429         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3430         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3431         for (x = startx;x < endx;x++)
3432         {
3433                 __m128i color, pix;
3434                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3435                 {
3436                         __m128i pix2;
3437                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3438                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3439                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3440                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3441                         x += 3;
3442                         continue;
3443                 }
3444                 if (!pixelmask[x])
3445                         continue;
3446                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3447                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3448                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3449         }
3450         if (pixel == buffer_FragColorbgra8)
3451                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3452 #endif
3453 }
3454
3455
3456
// Vertex shader for the vertex-color mode.
// Processes three vertex arrays in place (input index == output index):
//  - POSITION is transform-projected with the matrix stored at the
//    ModelViewProjectionMatrixM1 uniform slot,
//  - COLOR is passed through DPSOFTRAST_Array_Load (presumably an
//    untransformed copy - confirm against that helper),
//  - TEXCOORD0 is transformed with the matrix at the TexMatrixM1 uniform slot.
3457 void DPSOFTRAST_VertexShader_VertexColor(void)
3458 {
3459         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3460         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3461         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3462 }
3463
// Pixel shader for the vertex-color mode.
// For every unmasked pixel of the span it writes
//     texture * (Color_Diffuse * interpolated_color + Color_Ambient)
// as 8-bit BGRA, using 16-bit fixed-point SSE2 arithmetic. The alpha lane of
// the modulation term comes only from the Alpha uniform (the diffuse alpha
// lane is masked to zero below).
// thread   - per-thread state: uniforms, alphatest flag, blend mode.
// triangle - triangle setup data consumed by DPSOFTRAST_CALCATTRIB and the span helpers.
// span     - span to shade: x/y origin, startx/endx range and per-pixel mask.
// The whole body is compiled only when SSE_POSSIBLE is defined; otherwise the
// function does nothing.
3464 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3465 {
3466 #ifdef SSE_POSSIBLE
3467         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // by default results are written straight into color buffer 0 at the span's origin (4 bytes per pixel)
3468         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3469         int x, startx = span->startx, endx = span->endx;
3470         __m128i Color_Ambientm, Color_Diffusem;
3471         __m128 data, slope;
3472         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3473         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3474         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3475         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3476         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // NOTE(review): the literal 2 is the texcoord array index; DPSOFTRAST_PixelShader_Lightmap passes
             // DPSOFTRAST_ARRAY_TEXCOORD0 in the same position - presumably the same value, confirm.
3477         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
             // alpha test or blending needs a post pass, so shade into a scratch buffer instead of the framebuffer
3478         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3479                 pixel = buffer_FragColorbgra8;
             // ambient color scaled by 256 and converted to integers; the shuffle swaps lanes 0 and 2
             // (presumably RGBA -> BGRA, matching the bgra8 buffers)
3480         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
             // replace lane 3 with the Alpha uniform scaled by 255
3481         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3482         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
             // narrow to 16 bit with signed saturation; both 64-bit halves now hold the same four values (two pixels per register)
3483         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse color uses a 4096 scale; combined with the 4096-scaled vertex color, _mm_mulhi_epu16 (>> 16)
             // brings the product back to the 256 scale of the ambient term. Lane 3 is forced to zero.
3484         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3485         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3486         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
             // set up interpolation of the vertex color attribute along the span: same lane swap as the
             // uniforms, advanced to startx and pre-scaled by 4096
3487         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3488         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3489         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3490         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3491         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3492         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
             // the loop increment steps 'data' once per pixel, including pixels skipped via 'continue'
3493         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3494         {
3495                 __m128i color, mod, pix;
                     // fast path: at least four pixels remain and their four mask bytes are all 1
3496                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3497                 {
3498                         __m128i pix2, mod2;
3499                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3500                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                             // per pixel: interpolated color * buffer_z[pixel], converted to integers and packed to 16 bit;
                             // 'data' is stepped three times here and a fourth time by the loop increment
3501                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3502                         data = _mm_add_ps(data, slope);
3503                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3504                         data = _mm_add_ps(data, slope);
3505                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3506                         data = _mm_add_ps(data, slope);
3507                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
                             // unpacking against zero puts each texel byte into the high byte of a 16-bit lane (value << 8),
                             // so the outer _mm_mulhi_epu16 yields (modulation * texel) >> 8
3508                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3509                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3510                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3511                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
                             // saturate to bytes and store four BGRA pixels at once
3512                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                             // together with the loop's x++ this advances by four pixels
3513                         x += 3;
3514                         continue;
3515                 }
                     // slow path: one pixel at a time, skipping pixels whose mask byte is zero
3516                 if (!pixelmask[x])
3517                         continue;
3518                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3519                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3520                 mod = _mm_packs_epi32(mod, mod);
3521                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3522                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3523         }
             // if the scratch buffer was used, hand it to the finishing pass
3524         if (pixel == buffer_FragColorbgra8)
3525                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3526 #endif
3527 }
3528
3529
3530
// Vertex shader for the lightmap mode.
// Processes three vertex arrays in place (input index == output index):
//  - POSITION is transform-projected with the matrix stored at the
//    ModelViewProjectionMatrixM1 uniform slot,
//  - TEXCOORD0 is transformed with the matrix at the TexMatrixM1 uniform slot,
//  - TEXCOORD4 is passed through DPSOFTRAST_Array_Load (presumably an
//    untransformed copy - confirm); DPSOFTRAST_PixelShader_Lightmap samples
//    the lightmap texture with it.
3531 void DPSOFTRAST_VertexShader_Lightmap(void)
3532 {
3533         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3534         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3535         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3536 }
3537
// Pixel shader for the lightmap mode.
// For every unmasked pixel of the span it writes, as 8-bit BGRA,
//     color * (Color_Diffuse * lightmap + Color_Ambient)
// and, when the SHADERPERMUTATION_GLOW bit is set, additionally adds
//     Color_Glow * glow.
// All math is 16-bit fixed point via SSE2; the alpha lane of the modulation
// term comes only from the Alpha uniform (diffuse and glow alpha lanes are
// masked to zero below).
// thread   - per-thread state: uniforms, shader permutation, alphatest flag, blend mode.
// triangle - triangle setup data consumed by the span helpers.
// span     - span to shade: x/y origin, startx/endx range and per-pixel mask.
// The whole body is compiled only when SSE_POSSIBLE is defined; otherwise the
// function does nothing.
3538 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3539 {
3540 #ifdef SSE_POSSIBLE
3541         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // by default results are written straight into color buffer 0 at the span's origin (4 bytes per pixel)
3542         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3543         int x, startx = span->startx, endx = span->endx;
3544         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3545         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3546         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3547         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
             // only filled (and only read) in the GLOW permutation branch below
3548         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3549         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3550         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // color texture is sampled with TEXCOORD0, lightmap texture with TEXCOORD4
3551         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3552         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
             // alpha test or blending needs a post pass, so shade into a scratch buffer instead of the framebuffer
3553         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3554                 pixel = buffer_FragColorbgra8;
             // ambient color scaled by 256 and converted to integers; the shuffle swaps lanes 0 and 2
             // (presumably RGBA -> BGRA, matching the bgra8 buffers); lane 3 is replaced with Alpha * 255
3555         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3556         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3557         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
             // narrow to 16 bit with signed saturation; both 64-bit halves now hold the same four values (two pixels per register)
3558         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse color, same 256 scale and lane order, lane 3 forced to zero
3559         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3560         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3561         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3562         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3563         {
                     // glow texture is sampled with TEXCOORD0, like the color texture
3564                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3565                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3566                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3567                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
                     // low 64 bits = ambient, high 64 bits = glow color; used by the single-pixel path to do both
                     // multiplies with one _mm_mulhi_epu16
3568                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3569                 for (x = startx;x < endx;x++)
3570                 {
3571                         __m128i color, lightmap, glow, pix;
                             // fast path: at least four pixels remain and their four mask bytes are all 1
3572                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3573                         {
3574                                 __m128i pix2;
3575                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3576                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3577                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
                                     // unpacking against zero puts each texel byte into the high byte of a 16-bit lane (value << 8);
                                     // pix covers pixels x and x+1, pix2 covers pixels x+2 and x+3
3578                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3579                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3580                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3581                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3582                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3583                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
                                     // saturate to bytes and store four BGRA pixels at once
3584                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                     // together with the loop's x++ this advances by four pixels
3585                                 x += 3;
3586                                 continue;
3587                         }
                             // slow path: one pixel at a time, skipping pixels whose mask byte is zero
3588                         if (!pixelmask[x])
3589                                 continue;
3590                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3591                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3592                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
                             // the upper four lanes of 'lightmap' are zero, so after adding Color_AmbientGlowm the low half holds
                             // diffuse*lightmap+ambient and the high half holds the glow color; multiplying by (color | glow)
                             // gives the lit color in the low half and the glow term in the high half
3593                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
                             // add the high half (glow term) onto the low half
3594                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3595                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3596                 }
3597         }
3598         else
3599         {
                     // same loop without the glow term
3600                 for (x = startx;x < endx;x++)
3601                 {
3602                         __m128i color, lightmap, pix;
                             // fast path: at least four pixels remain and their four mask bytes are all 1
3603                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3604                         {
3605                                 __m128i pix2;
3606                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3607                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3608                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3609                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3610                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3611                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3612                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                     // together with the loop's x++ this advances by four pixels
3613                                 x += 3;
3614                                 continue;
3615                         }
                             // slow path: one pixel at a time, skipping pixels whose mask byte is zero
3616                         if (!pixelmask[x]) 
3617                                 continue;
3618                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3619                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3620                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3621                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3622                 }
3623         }
             // if the scratch buffer was used, hand it to the finishing pass
3624         if (pixel == buffer_FragColorbgra8)
3625                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3626 #endif
3627 }
3628
3629
// Forward declarations: the FakeLight and LightDirectionMap shaders below
// delegate to these two functions, which are defined further down in the file.
3630 void DPSOFTRAST_VertexShader_LightDirection(void);
3631 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3632
// Vertex shader for the fake-light mode: identical to the light-direction
// vertex shader, to which it simply delegates.
3633 void DPSOFTRAST_VertexShader_FakeLight(void)
3634 {
3635         DPSOFTRAST_VertexShader_LightDirection();
3636 }
3637
// Pixel shader for the fake-light mode: forwards all arguments unchanged to
// DPSOFTRAST_PixelShader_LightDirection, which branches on
// thread->shader_mode (including SHADERMODE_FAKELIGHT) internally.
3638 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3639 {
3640         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3641 }
3642
3643
3644
// Vertex shader for the model-space light-direction-map mode: runs the
// light-direction vertex shader and additionally passes TEXCOORD4 through
// DPSOFTRAST_Array_Load; DPSOFTRAST_PixelShader_LightDirection samples the
// lightmap and deluxemap textures with TEXCOORD4 in this mode.
3645 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3646 {
3647         DPSOFTRAST_VertexShader_LightDirection();
3648         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3649 }
3650
// Pixel shader for the model-space light-direction-map mode: forwards all
// arguments unchanged to DPSOFTRAST_PixelShader_LightDirection, which
// branches on thread->shader_mode (SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
// internally.
3651 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3652 {
3653         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3654 }
3655
3656
3657
// Vertex shader for the tangent-space light-direction-map mode: runs the
// light-direction vertex shader and additionally passes TEXCOORD4 through
// DPSOFTRAST_Array_Load; DPSOFTRAST_PixelShader_LightDirection samples the
// lightmap and deluxemap textures with TEXCOORD4 in this mode.
3658 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3659 {
3660         DPSOFTRAST_VertexShader_LightDirection();
3661         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3662 }
3663
// Pixel shader for the tangent-space light-direction-map mode: forwards all
// arguments unchanged to DPSOFTRAST_PixelShader_LightDirection, which
// branches on thread->shader_mode (SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
// internally.
3664 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3665 {
3666         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3667 }
3668
3669
3670
3671 void DPSOFTRAST_VertexShader_LightDirection(void)
3672 {
3673         int i;
3674         int numvertices = dpsoftrast.numvertices;
3675         float LightDir[4];
3676         float LightVector[4];
3677         float EyePosition[4];
3678         float EyeVectorModelSpace[4];
3679         float EyeVector[4];
3680         float position[4];
3681         float svector[4];
3682         float tvector[4];
3683         float normal[4];
3684         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3685         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3686         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3687         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3688         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3689         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3690         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3691         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3692         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3693         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3694         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3695         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3696         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3697         for (i = 0;i < numvertices;i++)
3698         {
3699                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3700                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3701                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3702                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3703                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3704                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3705                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3706                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3707                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3708                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3709                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3710                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3711                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3712                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3713                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3714                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3715                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3716                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3717                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3718                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3719                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3720                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3721                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3722                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3723                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3724                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3725                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3726                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3727                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3728         }
3729         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3730 }
3731
3732 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3733 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3734 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3735 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3736 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3737 #define DPSOFTRAST_Vector3Normalize(v)\
3738 do\
3739 {\
3740         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3741         if (len)\
3742         {\
3743                 len = 1.0f / len;\
3744                 v[0] *= len;\
3745                 v[1] *= len;\
3746                 v[2] *= len;\
3747         }\
3748 }\
3749 while(0)
3750
3751 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3752 {
3753         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3754         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3755         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3756         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3757         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3758         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3759         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3760         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3761         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3762         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3763         int x, startx = span->startx, endx = span->endx;
3764         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3765         float LightVectordata[4];
3766         float LightVectorslope[4];
3767         float EyeVectordata[4];
3768         float EyeVectorslope[4];
3769         float VectorSdata[4];
3770         float VectorSslope[4];
3771         float VectorTdata[4];
3772         float VectorTslope[4];
3773         float VectorRdata[4];
3774         float VectorRslope[4];
3775         float z;
3776         float diffusetex[4];
3777         float glosstex[4];
3778         float surfacenormal[4];
3779         float lightnormal[4];
3780         float lightnormal_modelspace[4];
3781         float eyenormal[4];
3782         float specularnormal[4];
3783         float diffuse;
3784         float specular;
3785         float SpecularPower;
3786         int d[4];
3787         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3788         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3789         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3790         Color_Glow[3] = 0.0f;
3791         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3792         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3793         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3794         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3795         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3796         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3797         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3798         Color_Pants[3] = 0.0f;
3799         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3800         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3801         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3802         Color_Shirt[3] = 0.0f;
3803         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3804         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3805         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3806         {
3807                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3808                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3809         }
3810         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3811         {
3812                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3813         }
3814         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3815         {
3816                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3817                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3818                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3819                 Color_Diffuse[3] = 0.0f;
3820                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3821                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3822                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3823                 LightColor[3] = 0.0f;
3824                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3825                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3826                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3827                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3828                 Color_Specular[3] = 0.0f;
3829                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3830                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3831                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3832
3833                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3834                 {
3835                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3836                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3837                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3838                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3839                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3840                 }
3841                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3842                 {
3843                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3844                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3845                 }
3846                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3847                 {
3848                         // nothing of this needed
3849                 }
3850                 else
3851                 {
3852                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3853                 }
3854
3855                 for (x = startx;x < endx;x++)
3856                 {
3857                         z = buffer_z[x];
3858                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3859                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3860                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3861                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3862                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3863                         {
3864                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3865                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3866                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3867                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3868                         }
3869                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3870                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3871                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3872                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3873                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3874                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3875                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3876                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3877
3878                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3879                         {
3880                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3881                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3882                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3883                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3884
3885                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3886                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3887                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3888                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3889
3890                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3891                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3892                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3893                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3894
3895                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3896                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3897                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3898                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3899
3900                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3901                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3902
3903                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3904                                 {
3905                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3906                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3907                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3908                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3909                                 }
3910                         }
3911                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3912                         {
3913                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3914                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3915                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3916                                 {
3917                                         float f = 1.0f / 256.0f;
3918                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3919                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3920                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3921                                 }
3922                         }
3923                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3924                         {
3925                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3926                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3927                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3928                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3929
3930                                 LightColor[0] = 1.0;
3931                                 LightColor[1] = 1.0;
3932                                 LightColor[2] = 1.0;
3933                         }
3934                         else
3935                         {
3936                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3937                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3938                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3939                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3940                         }
3941
3942                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3943
3944                         if(thread->shader_exactspecularmath)
3945                         {
3946                                 // reflect lightnormal at surfacenormal, take the negative of that
3947                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3948                                 float f;
3949                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3950                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3951                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3952                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3953
3954                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3955                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3956                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3957                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3958                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3959
3960                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3961                         }
3962                         else
3963                         {
3964                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3965                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3966                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3967                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3968
3969                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3970                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3971                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3972                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3973
3974                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3975                         }
3976
3977                         specular = pow(specular, SpecularPower * glosstex[3]);
3978                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3979                         {
3980                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3981                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3982                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3983                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3984                         }
3985                         else
3986                         {
3987                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3988                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3989                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3990                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3991                         }
3992
3993                         buffer_FragColorbgra8[x*4+0] = d[0];
3994                         buffer_FragColorbgra8[x*4+1] = d[1];
3995                         buffer_FragColorbgra8[x*4+2] = d[2];
3996                         buffer_FragColorbgra8[x*4+3] = d[3];
3997                 }
3998         }
3999         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4000         {
4001                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4002                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4003                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4004                 Color_Diffuse[3] = 0.0f;
4005                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4006                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4007                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4008                 LightColor[3] = 0.0f;
4009                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4010
4011                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
4012                 {
4013                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4014                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4015                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4016                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
4017                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
4018                 }
4019                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
4020                 {
4021                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
4022                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
4023                 }
4024                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4025                 {
4026                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4027                 }
4028                 else
4029                 {
4030                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
4031                 }
4032
4033                 for (x = startx;x < endx;x++)
4034                 {
4035                         z = buffer_z[x];
4036                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4037                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4038                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4039                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4040                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4041                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4042                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4043                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4044
4045                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
4046                         {
4047                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
4048                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4049                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4050                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4051
4052                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
4053                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
4054                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
4055                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
4056
4057                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
4058                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
4059                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
4060                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
4061
4062                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
4063                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
4064                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
4065                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
4066
4067                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
4068                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4069
4070                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
4071                                 {
4072                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
4073                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4074                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4075                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4076                                 }
4077                         }
4078                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
4079                         {
4080                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4081                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4082                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4083                                 {
4084                                         float f = 1.0f / 256.0f;
4085                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
4086                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
4087                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
4088                                 }
4089                         }
4090                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4091                         {
4092                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4093                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4094                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4095                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4096
4097                                 LightColor[0] = 1.0;
4098                                 LightColor[1] = 1.0;
4099                                 LightColor[2] = 1.0;
4100                         }
4101                         else
4102                         {
4103                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4104                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4105                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4106                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4107                         }
4108
4109                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4110                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4111                         {
4112                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4113                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4114                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4115                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4116                         }
4117                         else
4118                         {
4119                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4120                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4121                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4122                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4123                         }
4124                         buffer_FragColorbgra8[x*4+0] = d[0];
4125                         buffer_FragColorbgra8[x*4+1] = d[1];
4126                         buffer_FragColorbgra8[x*4+2] = d[2];
4127                         buffer_FragColorbgra8[x*4+3] = d[3];
4128                 }
4129         }
4130         else
4131         {
4132                 for (x = startx;x < endx;x++)
4133                 {
4134                         z = buffer_z[x];
4135                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4136                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4137                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4138                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4139
4140                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4141                         {
4142                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4143                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4144                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4145                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4146                         }
4147                         else
4148                         {
4149                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4150                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4151                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4152                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4153                         }
4154                         buffer_FragColorbgra8[x*4+0] = d[0];
4155                         buffer_FragColorbgra8[x*4+1] = d[1];
4156                         buffer_FragColorbgra8[x*4+2] = d[2];
4157                         buffer_FragColorbgra8[x*4+3] = d[3];
4158                 }
4159         }
4160         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4161 }
4162
4163
4164
4165 void DPSOFTRAST_VertexShader_LightSource(void)
4166 {
4167         int i;
4168         int numvertices = dpsoftrast.numvertices;
4169         float LightPosition[4];
4170         float LightVector[4];
4171         float LightVectorModelSpace[4];
4172         float EyePosition[4];
4173         float EyeVectorModelSpace[4];
4174         float EyeVector[4];
4175         float position[4];
4176         float svector[4];
4177         float tvector[4];
4178         float normal[4];
4179         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4180         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4181         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4182         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4183         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4184         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4185         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4186         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4187         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4188         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4189         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4190         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4191         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4192         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4193         for (i = 0;i < numvertices;i++)
4194         {
4195                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4196                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4197                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4198                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4199                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4200                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4201                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4202                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4203                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4204                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4205                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4206                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4207                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4208                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4209                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4210                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4211                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4212                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4213                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4214                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4215                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4216                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4217                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4218                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4219                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4220                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4221                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4222                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4223                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4224                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4225                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4226                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4227         }
4228         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4229         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4230 }
4231
4232 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4233 {
4234 #ifdef SSE_POSSIBLE
4235         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4236         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4237         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4238         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4239         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4240         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4241         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4242         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4243         int x, startx = span->startx, endx = span->endx;
4244         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4245         float CubeVectordata[4];
4246         float CubeVectorslope[4];
4247         float LightVectordata[4];
4248         float LightVectorslope[4];
4249         float EyeVectordata[4];
4250         float EyeVectorslope[4];
4251         float z;
4252         float diffusetex[4];
4253         float glosstex[4];
4254         float surfacenormal[4];
4255         float lightnormal[4];
4256         float eyenormal[4];
4257         float specularnormal[4];
4258         float diffuse;
4259         float specular;
4260         float SpecularPower;
4261         float CubeVector[4];
4262         float attenuation;
4263         int d[4];
4264         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4265         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4266         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4267         Color_Glow[3] = 0.0f;
4268         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4269         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4270         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4271         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4272         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4273         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4274         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4275         Color_Diffuse[3] = 0.0f;
4276         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4277         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4278         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4279         Color_Specular[3] = 0.0f;
4280         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4281         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4282         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4283         Color_Pants[3] = 0.0f;
4284         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4285         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4286         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4287         Color_Shirt[3] = 0.0f;
4288         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4289         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4290         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4291         LightColor[3] = 0.0f;
4292         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4293         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4294         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4295         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4296         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4297         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4298         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4299         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4300         {
4301                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4302                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4303         }
4304         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4305                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4306         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4307         {
4308                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4309                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4310                 for (x = startx;x < endx;x++)
4311                 {
4312                         z = buffer_z[x];
4313                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4314                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4315                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4316                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4317                         if (attenuation < 0.01f)
4318                                 continue;
4319                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4320                         {
4321                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4322                                 if (attenuation < 0.01f)
4323                                         continue;
4324                         }
4325
4326                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4327                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4328                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4329                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4330                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4331                         {
4332                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4333                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4334                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4335                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4336                         }
4337                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4338                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4339                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4340                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4341                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4342                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4343                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4344                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4345
4346                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4347                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4348                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4349                         DPSOFTRAST_Vector3Normalize(lightnormal);
4350
4351                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4352
4353                         if(thread->shader_exactspecularmath)
4354                         {
4355                                 // reflect lightnormal at surfacenormal, take the negative of that
4356                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4357                                 float f;
4358                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4359                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4360                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4361                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4362
4363                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4364                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4365                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4366                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4367                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4368
4369                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4370                         }
4371                         else
4372                         {
4373                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4374                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4375                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4376                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4377
4378                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4379                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4380                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4381                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4382
4383                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4384                         }
4385                         specular = pow(specular, SpecularPower * glosstex[3]);
4386
4387                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4388                         {
4389                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4390                                 attenuation *= (1.0f / 255.0f);
4391                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4392                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4393                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4394                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4395                         }
4396                         else
4397                         {
4398                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4399                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4400                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4401                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4402                         }
4403                         buffer_FragColorbgra8[x*4+0] = d[0];
4404                         buffer_FragColorbgra8[x*4+1] = d[1];
4405                         buffer_FragColorbgra8[x*4+2] = d[2];
4406                         buffer_FragColorbgra8[x*4+3] = d[3];
4407                 }
4408         }
4409         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4410         {
4411                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4412                 for (x = startx;x < endx;x++)
4413                 {
4414                         z = buffer_z[x];
4415                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4416                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4417                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4418                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4419                         if (attenuation < 0.01f)
4420                                 continue;
4421                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4422                         {
4423                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4424                                 if (attenuation < 0.01f)
4425                                         continue;
4426                         }
4427
4428                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4429                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4430                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4431                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4432                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4433                         {
4434                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4435                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4436                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4437                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4438                         }
4439                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4440                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4441                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4442                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4443
4444                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4445                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4446                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4447                         DPSOFTRAST_Vector3Normalize(lightnormal);
4448
4449                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4450                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4451                         {
4452                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4453                                 attenuation *= (1.0f / 255.0f);
4454                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4455                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4456                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4457                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4458                         }
4459                         else
4460                         {
4461                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4462                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4463                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4464                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4465                         }
4466                         buffer_FragColorbgra8[x*4+0] = d[0];
4467                         buffer_FragColorbgra8[x*4+1] = d[1];
4468                         buffer_FragColorbgra8[x*4+2] = d[2];
4469                         buffer_FragColorbgra8[x*4+3] = d[3];
4470                 }
4471         }
4472         else
4473         {
4474                 for (x = startx;x < endx;x++)
4475                 {
4476                         z = buffer_z[x];
4477                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4478                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4479                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4480                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4481                         if (attenuation < 0.01f)
4482                                 continue;
4483                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4484                         {
4485                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4486                                 if (attenuation < 0.01f)
4487                                         continue;
4488                         }
4489
4490                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4491                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4492                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4493                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4494                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4495                         {
4496                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4497                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4498                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4499                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4500                         }
4501                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4502                         {
4503                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4504                                 attenuation *= (1.0f / 255.0f);
4505                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4506                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4507                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4508                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4509                         }
4510                         else
4511                         {
4512                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4513                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4514                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4515                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4516                         }
4517                         buffer_FragColorbgra8[x*4+0] = d[0];
4518                         buffer_FragColorbgra8[x*4+1] = d[1];
4519                         buffer_FragColorbgra8[x*4+2] = d[2];
4520                         buffer_FragColorbgra8[x*4+3] = d[3];
4521                 }
4522         }
4523         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4524 #endif
4525 }
4526
4527
4528
4529 void DPSOFTRAST_VertexShader_Refraction(void)
4530 {
             // Vertex stage of the refraction shader mode.
             // Order matters: the POSITION array is used as the input of the first
             // call and is only overwritten in place by the final TransformProject call.
             // presumably the argument order is (output array, input array, matrix) -
             // DPSOFTRAST_PixelShader_Refraction reads TEXCOORD1 as its
             // ModelViewProjectionPosition varying; confirm against DPSOFTRAST_Array_Transform.
4531         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
             // TEXCOORD0 is run through the TexMatrixM1 uniform in place
4532         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
             // POSITION is transformed in place with the projecting variant of the helper
4533         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4534 }
4535
4536 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4537 {
             // Span shader for the refraction mode: for every pixel x in
             // [span->startx, span->endx) it derives a screen-space texture coordinate
             // from the TEXCOORD1 varying, offsets it by the normal-map sample, fetches
             // the texture bound to GL20TU_REFRACTION there, tints it with the
             // RefractColor uniform and passes the finished span to
             // DPSOFTRAST_Draw_Span_FinishBGRA8.
             // thread   - supplies the bound textures (texbound) and uniforms (uniform4f)
             // triangle - forwarded to the span helpers / DPSOFTRAST_CALCATTRIB4F
             // span     - supplies startx/endx and is forwarded to the span helpers
             // Nothing is drawn when no refraction texture is bound (early return below).
4538         // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4539
4540         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
             // NOTE(review): z is assigned inside the loop but only referenced by
             // commented-out code - looks unused; confirm before removing.
4541         float z;
4542         int x, startx = span->startx, endx = span->endx;
4543
4544         // texture reads
4545         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4546         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4547
4548         // varyings
4549         float ModelViewProjectionPositiondata[4];
4550         float ModelViewProjectionPositionslope[4];
4551
4552         // uniforms
4553         float ScreenScaleRefractReflect[2];
4554         float ScreenCenterRefractReflect[2];
4555         float DistortScaleRefractReflect[2];
4556         float RefractColor[4];
4557
4558         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
             // bail out before any span work when there is nothing to sample from
4559         if(!texture) return;
4560
4561         // read textures
             // presumably Draw_Span_Begin fills buffer_z for the span (it is read per pixel below) - confirm
4562         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4563         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4564
4565         // read varyings
4566         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4567
4568         // read uniforms
4569         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4570         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4571         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4572         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4573         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4574         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
             // components 0 and 2 are swapped while loading, so RefractColor[i] lines up
             // with byte i of the bgra8 buffers used below
4575         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4576         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4577         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4578         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4579
4580         // do stuff
4581         for (x = startx;x < endx;x++)
4582         {
4583                 float SafeScreenTexCoord[2];
4584                 float ScreenTexCoord[2];
4585                 float v[3];
4586                 float iw;
4587                 unsigned char c[4];
4588
4589                 z = buffer_z[x];
4590
4591                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
                     // reciprocal of the varying's 4th component, evaluated linearly at x (data + slope*x)
4592                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4593                 
4594                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4595                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4596                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4597
4598                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
                     // NOTE(review): the quoted GLSL uses DistortScaleRefractReflect.zw, while the
                     // code below uses elements 0 and 1 of the uniform - confirm which is intended.
                     // Decode the normal-map bytes: byte*(1/128)-1 maps 0 -> -1, 128 -> 0, 255 -> 0.9921875;
                     // bytes 2,1,0 of the bgra8 sample become v[0],v[1],v[2].
4599                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4600                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4601                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4602                 DPSOFTRAST_Vector3Normalize(v);
                     // only v[0] and v[1] displace the lookup; v[2] is used by the normalize call alone
4603                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4604                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4605
4606                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4607                 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4608
4609                 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
                     // NOTE(review): the three colour products are stored into unsigned char without
                     // clamping; a RefractColor component above 1.0 could push them past 255 -
                     // confirm callers keep the uniform within [0,1].
4610                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4611                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4612                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
                     // alpha ignores the sampled c[3]; it is RefractColor[3]*256 capped at 255 (no lower bound applied)
4613                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4614         }
4615
4616         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4617 }
4618
4619
4620
4621 void DPSOFTRAST_VertexShader_Water(void)
4622 {
             // Vertex stage of the water shader mode: the POSITION array is
             // transformed in place by DPSOFTRAST_Array_TransformProject using the
             // ModelViewProjectionMatrixM1 uniform; no other array is touched here.
4623         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4624 }
4625
4626
4627 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4628 {
             // Placeholder span shader for the water mode: no texture or uniform is
             // read; the span is submitted with every colour byte set to zero.
4629         // TODO: IMPLEMENT
4630         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4631         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
             // buffer_z is only handed to Draw_Span_Begin; it is not read in this function
4632         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // zero the 4 bytes of each pixel in [span->startx, span->endx); bytes outside that range stay uninitialized
4633         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4634         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4635 }
4636
4637
4638
4639 void DPSOFTRAST_VertexShader_ShowDepth(void)
4640 {
             // Vertex stage of the show-depth shader mode: the POSITION array is
             // transformed in place by DPSOFTRAST_Array_TransformProject using the
             // ModelViewProjectionMatrixM1 uniform; no other array is touched here.
4641         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4642 }
4643
4644 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4645 {
             // Placeholder span shader for the show-depth mode: no depth value is
             // turned into colour yet; the span is submitted with every colour byte
             // set to zero.
4646         // TODO: IMPLEMENT
4647         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4648         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
             // buffer_z is only handed to Draw_Span_Begin; it is not read in this function
4649         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // zero the 4 bytes of each pixel in [span->startx, span->endx); bytes outside that range stay uninitialized
4650         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4651         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4652 }
4653
4654
4655
4656 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4657 {
             // Vertex stage of the deferred-geometry shader mode: the POSITION array
             // is transformed in place by DPSOFTRAST_Array_TransformProject using the
             // ModelViewProjectionMatrixM1 uniform; no other array is touched here.
4658         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4659 }
4660
4661 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4662 {
             // Placeholder span shader for the deferred-geometry mode: no texture or
             // uniform is read; the span is submitted with every colour byte set to zero.
4663         // TODO: IMPLEMENT
4664         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4665         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
             // buffer_z is only handed to Draw_Span_Begin; it is not read in this function
4666         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // zero the 4 bytes of each pixel in [span->startx, span->endx); bytes outside that range stay uninitialized
4667         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4668         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4669 }
4670
4671
4672
4673 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4674 {
             // Vertex stage of the deferred-light-source shader mode: the POSITION
             // array is transformed in place by DPSOFTRAST_Array_TransformProject using
             // the ModelViewProjectionMatrixM1 uniform; no other array is touched here.
4675         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4676 }
4677
4678 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4679 {
             // Placeholder span shader for the deferred-light-source mode: no texture
             // or uniform is read; the span is submitted with every colour byte set to zero.
4680         // TODO: IMPLEMENT
4681         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4682         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
             // buffer_z is only handed to Draw_Span_Begin; it is not read in this function
4683         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // zero the 4 bytes of each pixel in [span->startx, span->endx); bytes outside that range stay uninitialized
4684         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4685         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4686 }
4687
4688
4689
4690 typedef struct DPSOFTRAST_ShaderModeInfo_s
4691 {
             // One row of DPSOFTRAST_ShaderModeTable: the vertex and span callbacks of a
             // shader mode plus the arrays and texture units listed for it. The table
             // initializes rows positionally, so the field order here must not change.

             // NOTE(review): every table row visible in this file passes 2 here;
             // presumably the index of the array used for texture LOD selection - confirm.
4692         int lodarrayindex;
             // vertex-stage callback (takes no arguments)
4693         void (*Vertex)(void);
             // per-span pixel-stage callback
4694         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
             // list of DPSOFTRAST_ARRAY_* ids; the table ends each list with ~0,
             // which is stored as 0xFF in an unsigned char
4695         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
             // list of GL20TU_* texture unit ids, ended with ~0 (0xFF) in the same way
4696         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4697 }
4698 DPSOFTRAST_ShaderModeInfo;
4699
4700 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4701 {
4702         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4703         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4704         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4705         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4706         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4707         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4708         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4709         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4710         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4711         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4712         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4713         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4714         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4715         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4716         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4717         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4718 };
4719
4720 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4721 {
4722         int x;
4723         int startx;
4724         int endx;
4725         unsigned int *depthpixel;
4726         int depth;
4727         int depthslope;
4728         unsigned int d;
4729         unsigned char *pixelmask;
4730         DPSOFTRAST_State_Triangle *triangle;
4731         triangle = &thread->triangles[span->triangle];
4732         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4733         startx = span->startx;
4734         endx = span->endx;
4735         depth = span->depthbase;
4736         depthslope = span->depthslope;
4737         pixelmask = thread->pixelmaskarray;
4738         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4739         {
4740                 switch(thread->fb_depthfunc)
4741                 {
4742                 default:
4743                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4744                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4745                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4746                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4747                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4748                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4749                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4750                 }
4751                 while (startx < endx && !pixelmask[startx])
4752                         startx++;
4753                 while (endx > startx && !pixelmask[endx-1])
4754                         endx--;
4755         }
4756         else
4757         {
4758                 // no depth testing means we're just dealing with color...
4759                 memset(pixelmask + startx, 1, endx - startx);
4760         }
4761         span->pixelmask = pixelmask;
4762         span->startx = startx;
4763         span->endx = endx;
4764 }
4765
4766 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4767 {
4768         int x, d, depth, depthslope, startx, endx;
4769         const unsigned char *pixelmask;
4770         unsigned int *depthpixel;
4771         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4772         {
4773                 depth = span->depthbase;
4774                 depthslope = span->depthslope;
4775                 pixelmask = span->pixelmask;
4776                 startx = span->startx;
4777                 endx = span->endx;
4778                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4779                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4780                         if (pixelmask[x])
4781                                 depthpixel[x] = d;
4782         }
4783 }
4784
4785 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4786 {
4787         int i;
4788         DPSOFTRAST_State_Triangle *triangle;
4789         DPSOFTRAST_State_Span *span;
4790         for (i = 0; i < thread->numspans; i++)
4791         {
4792                 span = &thread->spans[i];
4793                 triangle = &thread->triangles[span->triangle];
4794                 DPSOFTRAST_Draw_DepthTest(thread, span);
4795                 if (span->startx >= span->endx)
4796                         continue;
4797                 // run pixel shader if appropriate
4798                 // do this before running depthmask code, to allow the pixelshader
4799                 // to clear pixelmask values for alpha testing
4800                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4801                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4802                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4803         }
4804         thread->numspans = 0;
4805 }
4806
4807 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4808
4809 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4810 {
4811 #ifdef SSE_POSSIBLE
4812         int cullface = thread->cullface;
4813         int minx, maxx, miny, maxy;
4814         int miny1, maxy1, miny2, maxy2;
4815         __m128i fbmin, fbmax;
4816         __m128 viewportcenter, viewportscale;
4817         int firstvertex = command->firstvertex;
4818         int numvertices = command->numvertices;
4819         int numtriangles = command->numtriangles;
4820         const int *element3i = command->element3i;
4821         const unsigned short *element3s = command->element3s;
4822         int clipped = command->clipped;
4823         int i;
4824         int j;
4825         int k;
4826         int y;
4827         int e[3];
4828         __m128i screeny;
4829         int starty, endy, bandy;
4830         int numpoints;
4831         int clipcase;
4832         float clipdist[4];
4833         float clip0origin, clip0slope;
4834         int clip0dir;
4835         __m128 triangleedge1, triangleedge2, trianglenormal;
4836         __m128 clipfrac[3];
4837         __m128 screen[4];
4838         DPSOFTRAST_State_Triangle *triangle;
4839         DPSOFTRAST_Texture *texture;
4840         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4841         miny = thread->fb_scissor[1];
4842         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4843         miny1 = bound(miny, thread->miny1, maxy);
4844         maxy1 = bound(miny, thread->maxy1, maxy);
4845         miny2 = bound(miny, thread->miny2, maxy);
4846         maxy2 = bound(miny, thread->maxy2, maxy);
4847         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4848         {
4849                 if (!ATOMIC_DECREMENT(command->refcount))
4850                 {
4851                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4852                                 MM_FREE(command->arrays);
4853                 }
4854                 return;
4855         }
4856         minx = thread->fb_scissor[0];
4857         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4858         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4859         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4860         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4861         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4862         screen[3] = _mm_setzero_ps();
4863         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4864         for (i = 0;i < numtriangles;i++)
4865         {
4866                 const float *screencoord4f = command->arrays;
4867                 const float *arrays = screencoord4f + numvertices*4;
4868
4869                 // generate the 3 edges of this triangle
4870                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4871                 if (element3s)
4872                 {
4873                         e[0] = element3s[i*3+0] - firstvertex;
4874                         e[1] = element3s[i*3+1] - firstvertex;
4875                         e[2] = element3s[i*3+2] - firstvertex;
4876                 }
4877                 else if (element3i)
4878                 {
4879                         e[0] = element3i[i*3+0] - firstvertex;
4880                         e[1] = element3i[i*3+1] - firstvertex;
4881                         e[2] = element3i[i*3+2] - firstvertex;
4882                 }
4883                 else
4884                 {
4885                         e[0] = i*3+0;
4886                         e[1] = i*3+1;
4887                         e[2] = i*3+2;
4888                 }
4889
4890 #define SKIPBACKFACE \
4891                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4892                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4893                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4894                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4895                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4896                 switch(cullface) \
4897                 { \
4898                 case GL_BACK: \
4899                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4900                                 continue; \
4901                         break; \
4902                 case GL_FRONT: \
4903                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4904                                 continue; \
4905                         break; \
4906                 }
4907
4908 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4909                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4910                         { \
4911                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4912                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4913                         }
4914 #define CLIPPEDVERTEXCOPY(k,p1) \
4915                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4916
4917 #define GENATTRIBCOPY(attrib, p1) \
4918                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4919 #define GENATTRIBLERP(attrib, p1, p2) \
4920                 { \
4921                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4922                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4923                 }
4924 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4925                 switch(clipcase) \
4926                 { \
4927                 default: \
4928                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4929                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4930                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4931                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4932                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4933                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4934                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4935                 }
4936
4937                 if (! clipped)
4938                         goto notclipped;
4939
4940                 // calculate distance from nearplane
4941                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4942                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4943                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4944                 if (clipdist[0] >= 0.0f)
4945                 {
4946                         if (clipdist[1] >= 0.0f)
4947                         {
4948                                 if (clipdist[2] >= 0.0f)
4949                                 {
4950                                 notclipped:
4951                                         // triangle is entirely in front of nearplane
4952                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4953                                         SKIPBACKFACE;
4954                                         numpoints = 3;
4955                                         clipcase = 0;
4956                                 }
4957                                 else
4958                                 {
4959                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4960                                         SKIPBACKFACE;
4961                                         numpoints = 4;
4962                                         clipcase = 1;
4963                                 }
4964                         }
4965                         else
4966                         {
4967                                 if (clipdist[2] >= 0.0f)
4968                                 {
4969                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4970                                         SKIPBACKFACE;
4971                                         numpoints = 4;
4972                                         clipcase = 2;
4973                                 }
4974                                 else
4975                                 {
4976                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4977                                         SKIPBACKFACE;
4978                                         numpoints = 3;
4979                                         clipcase = 3;
4980                                 }
4981                         }
4982                 }
4983                 else if (clipdist[1] >= 0.0f)
4984                 {
4985                         if (clipdist[2] >= 0.0f)
4986                         {
4987                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4988                                 SKIPBACKFACE;
4989                                 numpoints = 4;
4990                                 clipcase = 4;
4991                         }
4992                         else
4993                         {
4994                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4995                                 SKIPBACKFACE;
4996                                 numpoints = 3;
4997                                 clipcase = 5;
4998                         }
4999                 }
5000                 else if (clipdist[2] >= 0.0f)
5001                 {
5002                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5003                         SKIPBACKFACE;
5004                         numpoints = 3;
5005                         clipcase = 6;
5006                 }
5007                 else continue; // triangle is entirely behind nearplane
5008
5009                 {
5010                         // calculate integer y coords for triangle points
5011                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5012                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5013                                         screenmin = _mm_min_epi16(screeni, screenir),
5014                                         screenmax = _mm_max_epi16(screeni, screenir);
5015                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5016                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5017                         screenmin = _mm_max_epi16(screenmin, fbmin);
5018                         screenmax = _mm_min_epi16(screenmax, fbmax);
5019                         // skip offscreen triangles
5020                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5021                                 continue;
5022                         starty = _mm_extract_epi16(screenmin, 1);
5023                         endy = _mm_extract_epi16(screenmax, 1)+1;
5024                         if (starty >= maxy1 && endy <= miny2)
5025                                 continue;
5026                         screeny = _mm_srai_epi32(screeni, 16);
5027                 }
5028
5029                 triangle = &thread->triangles[thread->numtriangles];
5030
5031                 // calculate attribute plans for triangle data...
5032                 // okay, this triangle is going to produce spans, we'd better project
5033                 // the interpolants now (this is what gives perspective texturing),
5034                 // this consists of simply multiplying all arrays by the W coord
5035                 // (which is basically 1/Z), which will be undone per-pixel
5036                 // (multiplying by Z again) to get the perspective-correct array
5037                 // values
5038                 {
5039                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5040                         __m128 mipedgescale, mipdensity;
5041                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5042                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5043                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5044                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5045                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5046                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5047                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5048                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5049                         attribedge1 = _mm_sub_ss(w0, w1);
5050                         attribedge2 = _mm_sub_ss(w2, w1);
5051                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5052                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5053                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5054                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5055                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5056                         _mm_store_ss(&triangle->w[0], attribxslope);
5057                         _mm_store_ss(&triangle->w[1], attribyslope);
5058                         _mm_store_ss(&triangle->w[2], attriborigin);
5059                         
5060                         clip0origin = 0;
5061                         clip0slope = 0;
5062                         clip0dir = 0;
5063                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5064                         {
5065                                 float cliporigin, clipxslope, clipyslope;
5066                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5067                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5068                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5069                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5070                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5071                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5072                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5073                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5074                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5075                                 if(clipxslope != 0)
5076                                 {
5077                                         clip0origin = -cliporigin/clipxslope;
5078                                         clip0slope = -clipyslope/clipxslope;
5079                                         clip0dir = clipxslope > 0 ? 1 : -1;
5080                                 }
5081                                 else if(clipyslope > 0)
5082                                 {
5083                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5084                                         clip0slope = dpsoftrast.fb_width;
5085                                         clip0dir = -1;
5086                                 }
5087                                 else if(clipyslope < 0)
5088                                 {
5089                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5090                                         clip0slope = -dpsoftrast.fb_width;
5091                                         clip0dir = -1;
5092                                 }
5093                                 else if(clip0origin < 0) continue;
5094                         }
5095
5096                         mipedgescale = _mm_setzero_ps();
5097                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5098                         {
5099                                 __m128 attrib0, attrib1, attrib2;
5100                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5101                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5102                                         break;
5103                                 arrays += numvertices*4;
5104                                 GENATTRIBS(attrib0, attrib1, attrib2);
5105                                 attriborigin = _mm_mul_ps(attrib1, w1);
5106                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5107                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5108                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5109                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5110                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5111                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5112                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5113                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5114                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5115                                 {
5116                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5117                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5118                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5119                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5120                                 }
5121                         }
5122
5123                         memset(triangle->mip, 0, sizeof(triangle->mip));
5124                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5125                         {
5126                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5127                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5128                                         break;
5129                                 texture = thread->texbound[texunit];
5130                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5131                                 {
5132                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5133                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5134                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5135                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5136                                         // this will be multiplied in the texturing routine by the texture resolution
5137                                         y = _mm_cvtss_si32(mipdensity);
5138                                         if (y > 0)
5139                                         {
5140                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5141                                                 if (y > texture->mipmaps - 1)
5142                                                         y = texture->mipmaps - 1;
5143                                                 triangle->mip[texunit] = y;
5144                                         }
5145                                 }
5146                         }
5147                 }
5148         
5149                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5150                 for (; y < bandy;)
5151                 {
5152                         __m128 xcoords, xslope;
5153                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5154                         int yccmask = _mm_movemask_epi8(ycc);
5155                         int edge0p, edge0n, edge1p, edge1n;
5156                         int nexty;
5157                         float w, wslope;
5158                         float clip0;
5159                         if (numpoints == 4)
5160                         {
5161                                 switch(yccmask)
5162                                 {
5163                                 default:
5164                                 case 0xFFFF: /*0000*/ y = endy; continue;
5165                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5166                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5167                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5168                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5169                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5170                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5171                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5172                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5173                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5174                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5175                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5176                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5177                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5178                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5179                                 case 0x0000: /*1111*/ y++; continue;
5180                                 }
5181                         }
5182                         else
5183                         {
5184                                 switch(yccmask)
5185                                 {
5186                                 default:
5187                                 case 0xFFFF: /*000*/ y = endy; continue;
5188                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5189                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5190                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5191                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5192                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5193                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5194                                 case 0x0000: /*111*/ y++; continue;
5195                                 }
5196                         }
5197                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5198                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5199                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5200                         nexty = _mm_extract_epi16(ycc, 0);
5201                         if (nexty >= bandy) nexty = bandy-1;
5202                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5203                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5204                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5205                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5206                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5207                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5208                         {
5209                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5210                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5211                         }
5212                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5213                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5214                         {
5215                                 int startx, endx, offset;
5216                                 startx = _mm_cvtss_si32(xcoords);
5217                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5218                                 if (startx < minx) startx = minx;
5219                                 if (endx > maxx) endx = maxx;
5220                                 if (startx >= endx) continue;
5221
5222                                 if (clip0dir)
5223                                 {
5224                                         if (clip0dir > 0)
5225                                         {
5226                                                 if (startx < clip0) 
5227                                                 {
5228                                                         if(endx <= clip0) continue;
5229                                                         startx = (int)clip0;
5230                                                 }
5231                                         }
5232                                         else if (endx > clip0) 
5233                                         {
5234                                                 if(startx >= clip0) continue;
5235                                                 endx = (int)clip0;
5236                                         }
5237                                 }
5238                                                 
5239                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5240                                 {
5241                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5242                                         span->triangle = thread->numtriangles;
5243                                         span->x = offset;
5244                                         span->y = y;
5245                                         span->startx = 0;
5246                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5247                                         if (span->startx >= span->endx)
5248                                                 continue;
5249                                         wslope = triangle->w[0];
5250                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5251                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5252                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5253                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5254                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5255                                 }
5256                         }
5257                 }
5258
5259                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5260                 {
5261                         DPSOFTRAST_Draw_ProcessSpans(thread);
5262                         thread->numtriangles = 0;
5263                 }
5264         }
5265
5266         if (!ATOMIC_DECREMENT(command->refcount))
5267         {
5268                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5269                         MM_FREE(command->arrays);
5270         }
5271
5272         if (thread->numspans > 0 || thread->numtriangles > 0)
5273         {
5274                 DPSOFTRAST_Draw_ProcessSpans(thread);
5275                 thread->numtriangles = 0;
5276         }
5277 #endif
5278 }
5279
5280 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5281 {
5282         int i;
5283         int j;
5284         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5285         int datasize = 2*numvertices*sizeof(float[4]);
5286         DPSOFTRAST_Command_Draw *command;
5287         unsigned char *data;
5288         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5289         {
5290                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5291                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5292                         break;
5293                 datasize += numvertices*sizeof(float[4]);
5294         }
5295         if (element3s)
5296                 datasize += numtriangles*sizeof(unsigned short[3]);
5297         else if (element3i)
5298                 datasize += numtriangles*sizeof(int[3]);
5299         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5300         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5301         {
5302                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5303                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5304         }
5305         else
5306         {
5307                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5308                 data = (unsigned char *)command + commandsize;
5309         }
5310         command->firstvertex = firstvertex;
5311         command->numvertices = numvertices;
5312         command->numtriangles = numtriangles;
5313         command->arrays = (float *)data;
5314         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5315         dpsoftrast.firstvertex = firstvertex;
5316         dpsoftrast.numvertices = numvertices;
5317         dpsoftrast.screencoord4f = (float *)data;
5318         data += numvertices*sizeof(float[4]);
5319         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5320         data += numvertices*sizeof(float[4]);
5321         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5322         {
5323                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5324                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5325                         break;
5326                 dpsoftrast.post_array4f[j] = (float *)data;
5327                 data += numvertices*sizeof(float[4]);
5328         }
5329         command->element3i = NULL;
5330         command->element3s = NULL;
5331         if (element3s)
5332         {
5333                 command->element3s = (unsigned short *)data;
5334                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5335         }
5336         else if (element3i)
5337         {
5338                 command->element3i = (int *)data;
5339                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5340         }
5341         return command;
5342 }
5343
5344 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5345 {
5346         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5347         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5348         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5349         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5350         if (command->starty >= command->endy)
5351         {
5352                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5353                         MM_FREE(command->arrays);
5354                 DPSOFTRAST_UndoCommand(command->commandsize);
5355                 return;
5356         }
5357         command->clipped = dpsoftrast.drawclipped;
5358         command->refcount = dpsoftrast.numthreads;
5359
5360         if (dpsoftrast.usethreads)
5361         {
5362                 int i;
5363                 DPSOFTRAST_Draw_SyncCommands();
5364                 for (i = 0; i < dpsoftrast.numthreads; i++)
5365                 {
5366                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5367                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5368                                 Thread_CondSignal(thread->drawcond);
5369                 }
5370         }
5371         else
5372         {
5373                 DPSOFTRAST_Draw_FlushThreads();
5374         }
5375 }
5376
5377 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5378 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5379 {
5380         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5381 }
5382 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5383 {
5384         DPSOFTRAST_Command_SetRenderTargets *command;
5385         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5386                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5387                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5388                 DPSOFTRAST_Flush();
5389         dpsoftrast.fb_width = width;
5390         dpsoftrast.fb_height = height;
5391         dpsoftrast.fb_depthpixels = depthpixels;
5392         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5393         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5394         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5395         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5396         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5397         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5398         command->width = width;
5399         command->height = height;
5400 }
5401  
5402 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5403 {
5404         int commandoffset = thread->commandoffset;
5405         while (commandoffset != endoffset)
5406         {
5407                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5408                 switch (command->opcode)
5409                 {
5410 #define INTERPCOMMAND(name) \
5411                 case DPSOFTRAST_OPCODE_##name : \
5412                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5413                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5414                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5415                                 commandoffset = 0; \
5416                         break;
5417                 INTERPCOMMAND(Viewport)
5418                 INTERPCOMMAND(ClearColor)
5419                 INTERPCOMMAND(ClearDepth)
5420                 INTERPCOMMAND(ColorMask)
5421                 INTERPCOMMAND(DepthTest)
5422                 INTERPCOMMAND(ScissorTest)
5423                 INTERPCOMMAND(Scissor)
5424                 INTERPCOMMAND(BlendFunc)
5425                 INTERPCOMMAND(BlendSubtract)
5426                 INTERPCOMMAND(DepthMask)
5427                 INTERPCOMMAND(DepthFunc)
5428                 INTERPCOMMAND(DepthRange)
5429                 INTERPCOMMAND(PolygonOffset)
5430                 INTERPCOMMAND(CullFace)
5431                 INTERPCOMMAND(AlphaTest)
5432                 INTERPCOMMAND(AlphaFunc)
5433                 INTERPCOMMAND(SetTexture)
5434                 INTERPCOMMAND(SetShader)
5435                 INTERPCOMMAND(Uniform4f)
5436                 INTERPCOMMAND(UniformMatrix4f)
5437                 INTERPCOMMAND(Uniform1i)
5438                 INTERPCOMMAND(SetRenderTargets)
5439                 INTERPCOMMAND(ClipPlane)
5440
5441                 case DPSOFTRAST_OPCODE_Draw:
5442                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5443                         commandoffset += command->commandsize;
5444                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5445                                 commandoffset = 0;
5446                         thread->commandoffset = commandoffset;
5447                         break;
5448
5449                 case DPSOFTRAST_OPCODE_Reset:
5450                         commandoffset = 0;
5451                         break;
5452                 }
5453         }
5454         thread->commandoffset = commandoffset;
5455 }
5456
5457 static int DPSOFTRAST_Draw_Thread(void *data)
5458 {
5459         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5460         while(thread->index >= 0)
5461         {
5462                 if (thread->commandoffset != dpsoftrast.drawcommand)
5463                 {
5464                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5465                 }
5466                 else 
5467                 {
5468                         Thread_LockMutex(thread->drawmutex);
5469                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5470                         {
5471                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5472                                 thread->starving = true;
5473                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5474                                 thread->starving = false;
5475                         }
5476                         Thread_UnlockMutex(thread->drawmutex);
5477                 }
5478         }   
5479         return 0;
5480 }
5481
5482 static void DPSOFTRAST_Draw_FlushThreads(void)
5483 {
5484         DPSOFTRAST_State_Thread *thread;
5485         int i;
5486         DPSOFTRAST_Draw_SyncCommands();
5487         if (dpsoftrast.usethreads) 
5488         {
5489                 for (i = 0; i < dpsoftrast.numthreads; i++)
5490                 {
5491                         thread = &dpsoftrast.threads[i];
5492                         if (thread->commandoffset != dpsoftrast.drawcommand)
5493                         {
5494                                 Thread_LockMutex(thread->drawmutex);
5495                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5496                                         Thread_CondSignal(thread->drawcond);
5497                                 Thread_UnlockMutex(thread->drawmutex);
5498                         }
5499                 }
5500                 for (i = 0; i < dpsoftrast.numthreads; i++)
5501                 {
5502                         thread = &dpsoftrast.threads[i];
5503                         if (thread->commandoffset != dpsoftrast.drawcommand)
5504                         {
5505                                 Thread_LockMutex(thread->drawmutex);
5506                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5507                                 {
5508                                         thread->waiting = true;
5509                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5510                                         thread->waiting = false;
5511                                 }
5512                                 Thread_UnlockMutex(thread->drawmutex);
5513                         }
5514                 }
5515         }
5516         else
5517         {
5518                 for (i = 0; i < dpsoftrast.numthreads; i++)
5519                 {
5520                         thread = &dpsoftrast.threads[i];
5521                         if (thread->commandoffset != dpsoftrast.drawcommand)
5522                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5523                 }
5524         }
5525         dpsoftrast.commandpool.usedcommands = 0;
5526 }
5527
// Public entry point: executes every queued command before returning
// (forwards to DPSOFTRAST_Draw_FlushThreads).
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5532
// Public entry point: currently identical to DPSOFTRAST_Flush, which it calls.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5537
5538 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5539 {
5540         int i;
5541         union
5542         {
5543                 int i;
5544                 unsigned char b[4];
5545         }
5546         u;
5547         u.i = 1;
5548         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5549         dpsoftrast.bigendian = u.b[3];
5550         dpsoftrast.fb_width = width;
5551         dpsoftrast.fb_height = height;
5552         dpsoftrast.fb_depthpixels = depthpixels;
5553         dpsoftrast.fb_colorpixels[0] = colorpixels;
5554         dpsoftrast.fb_colorpixels[1] = NULL;
5555         dpsoftrast.fb_colorpixels[1] = NULL;
5556         dpsoftrast.fb_colorpixels[1] = NULL;
5557         dpsoftrast.viewport[0] = 0;
5558         dpsoftrast.viewport[1] = 0;
5559         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5560         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5561         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5562         dpsoftrast.texture_firstfree = 1;
5563         dpsoftrast.texture_end = 1;
5564         dpsoftrast.texture_max = 0;
5565         dpsoftrast.color[0] = 1;
5566         dpsoftrast.color[1] = 1;
5567         dpsoftrast.color[2] = 1;
5568         dpsoftrast.color[3] = 1;
5569         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5570         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5571         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5572         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5573         for (i = 0; i < dpsoftrast.numthreads; i++)
5574         {
5575                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5576                 thread->index = i;
5577                 thread->cullface = GL_BACK;
5578         thread->colormask[0] = 1; 
5579                 thread->colormask[1] = 1;
5580                 thread->colormask[2] = 1;
5581                 thread->colormask[3] = 1;
5582                 thread->blendfunc[0] = GL_ONE;
5583                 thread->blendfunc[1] = GL_ZERO;
5584                 thread->depthmask = true;
5585                 thread->depthtest = true;
5586                 thread->depthfunc = GL_LEQUAL;
5587                 thread->scissortest = false;
5588                 thread->alphatest = false;
5589                 thread->alphafunc = GL_GREATER;
5590                 thread->alphavalue = 0.5f;
5591                 thread->viewport[0] = 0;
5592                 thread->viewport[1] = 0;
5593                 thread->viewport[2] = dpsoftrast.fb_width;
5594                 thread->viewport[3] = dpsoftrast.fb_height;
5595                 thread->scissor[0] = 0;
5596                 thread->scissor[1] = 0;
5597                 thread->scissor[2] = dpsoftrast.fb_width;
5598                 thread->scissor[3] = dpsoftrast.fb_height;
5599                 thread->depthrange[0] = 0;
5600                 thread->depthrange[1] = 1;
5601                 thread->polygonoffset[0] = 0;
5602                 thread->polygonoffset[1] = 0;
5603                 thread->clipplane[0] = 0;
5604                 thread->clipplane[1] = 0;
5605                 thread->clipplane[2] = 0;
5606                 thread->clipplane[3] = 1;
5607         
5608                 thread->numspans = 0;
5609                 thread->numtriangles = 0;
5610                 thread->commandoffset = 0;
5611                 thread->waiting = false;
5612                 thread->starving = false;
5613            
5614                 thread->validate = -1;
5615                 DPSOFTRAST_Validate(thread, -1);
5616  
5617                 if (dpsoftrast.usethreads)
5618                 {
5619                         thread->waitcond = Thread_CreateCond();
5620                         thread->drawcond = Thread_CreateCond();
5621                         thread->drawmutex = Thread_CreateMutex();
5622                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5623                 }
5624         }
5625         return 0;
5626 }
5627
5628 void DPSOFTRAST_Shutdown(void)
5629 {
5630         int i;
5631         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5632         {
5633                 DPSOFTRAST_State_Thread *thread;
5634                 for (i = 0; i < dpsoftrast.numthreads; i++)
5635                 {
5636                         thread = &dpsoftrast.threads[i];
5637                         Thread_LockMutex(thread->drawmutex);
5638                         thread->index = -1;
5639                         Thread_CondSignal(thread->drawcond);
5640                         Thread_UnlockMutex(thread->drawmutex);
5641                         Thread_WaitThread(thread->thread, 0);
5642                         Thread_DestroyCond(thread->waitcond);
5643                         Thread_DestroyCond(thread->drawcond);
5644                         Thread_DestroyMutex(thread->drawmutex);
5645                 }
5646         }
5647         for (i = 0;i < dpsoftrast.texture_end;i++)
5648                 if (dpsoftrast.texture[i].bytes)
5649                         MM_FREE(dpsoftrast.texture[i].bytes);
5650         if (dpsoftrast.texture)
5651                 free(dpsoftrast.texture);
5652         if (dpsoftrast.threads)
5653                 MM_FREE(dpsoftrast.threads);
5654         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5655 }
5656