/*
 * dpsoftrast.c -- from xonotic/darkplaces.git (de.git.xonotic.org)
 * Revision note: make sure gloss never calculates 0^0
 */
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
// C builds have no built-in bool here, so alias it to qboolean
// (qboolean is not declared in this file; presumably it comes from quakedef.h)
#ifndef __cplusplus
typedef qboolean bool;
#endif
16
17 #define ALIGN_SIZE 16
18 #define ATOMIC_SIZE 4
19
// Per-toolchain definitions of the alignment and atomic helper macros.
// Only compiled when SSE_POSSIBLE is defined; each branch provides:
//   ALIGN(var)             - declare var with 16 byte alignment
//   ATOMIC(var)            - declare var with 4 byte alignment
//   MEMORY_BARRIER         - expands to _mm_sfence() in every branch
//   ATOMIC_COUNTER         - integer type used for atomically updated counters
//   ATOMIC_INCREMENT/DECREMENT/ADD - atomic counter updates (ADD yields void)
// Toolchains matching none of the branches get the plain fallbacks defined
// after this section.
#ifdef SSE_POSSIBLE
        #if defined(__APPLE__)
                #include <libkern/OSAtomic.h>
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(4)))
                #define MEMORY_BARRIER (_mm_sfence())
                #define ATOMIC_COUNTER volatile int32_t 
                #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
                #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
        #elif defined(__GNUC__) && defined(WIN32)
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(4)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile LONG
                // this LONG * cast serves to fix an issue with broken mingw
                // packages on Ubuntu; these only declare the function to take
                // a LONG *, causing a compile error here. This seems to be
                // error- and warn-free on platforms that DO declare
                // InterlockedIncrement correctly, like mingw on Windows.
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
                #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
        #elif defined(__GNUC__)
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(4)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile int
                #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
                #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
                #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
        #elif defined(_MSC_VER)
                #define ALIGN(var) __declspec(align(16)) var
                #define ATOMIC(var) __declspec(align(4)) var
                #define MEMORY_BARRIER (_mm_sfence())
                //(MemoryBarrier())
                #define ATOMIC_COUNTER volatile LONG
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
        #endif
#endif
64
// Fallbacks for any helper macro the platform section did not define:
// no alignment attributes, a no-op barrier, and ordinary (non-atomic)
// integer arithmetic on a plain int counter.
#ifndef ALIGN
#define ALIGN(var) var
#endif
#ifndef ATOMIC
#define ATOMIC(var) var
#endif
#ifndef MEMORY_BARRIER
#define MEMORY_BARRIER ((void)0)
#endif
#ifndef ATOMIC_COUNTER
#define ATOMIC_COUNTER int
#endif
#ifndef ATOMIC_INCREMENT
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#ifndef ATOMIC_DECREMENT
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#ifndef ATOMIC_ADD
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
86
87 #ifdef SSE_POSSIBLE
88 #include <emmintrin.h>
89
// Substitute for _mm_cvtss_f32 on GCC releases older than 4.6 (presumably
// because those releases do not provide the intrinsic - confirm).
// The version test must use __GNUC__ (with trailing underscores) and must
// only look at the minor number when the major number is exactly 4,
// otherwise the override would also hit e.g. GCC 5.0-5.5 and newer.
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
        #define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif
93
94 #define MM_MALLOC(size) _mm_malloc(size, ALIGN_SIZE)
95
96 static void *MM_CALLOC(size_t nmemb, size_t size)
97 {
98         void *ptr = _mm_malloc(nmemb*size, ALIGN_SIZE);
99         if (ptr != NULL) memset(ptr, 0, nmemb*size);
100         return ptr;
101 }
102
103 #define MM_FREE _mm_free
104 #else
105 #define MM_MALLOC(size) malloc(size)
106 #define MM_CALLOC(nmemb, size) calloc(nmemb, size)
107 #define MM_FREE free
108 #endif
109
110 #define DPSOFTRAST_FLT_MIN 0.000000000000000001f
111
// Indices of the per-vertex attribute arrays (position, color and eight
// texcoord sets). DPSOFTRAST_ARRAY_TOTAL is the array count and sizes
// DPSOFTRAST_State_Triangle.attribs and DPSOFTRAST_State.post_array4f.
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION,
        DPSOFTRAST_ARRAY_COLOR,
        DPSOFTRAST_ARRAY_TEXCOORD0,
        DPSOFTRAST_ARRAY_TEXCOORD1,
        DPSOFTRAST_ARRAY_TEXCOORD2,
        DPSOFTRAST_ARRAY_TEXCOORD3,
        DPSOFTRAST_ARRAY_TEXCOORD4,
        DPSOFTRAST_ARRAY_TEXCOORD5,
        DPSOFTRAST_ARRAY_TEXCOORD6,
        DPSOFTRAST_ARRAY_TEXCOORD7,
        DPSOFTRAST_ARRAY_TOTAL
}
DPSOFTRAST_ARRAY;
127
// One slot of the texture table (dpsoftrast.texture).
// A slot whose bytes pointer is NULL is treated as unused.
typedef struct DPSOFTRAST_Texture_s
{
        int flags; // DPSOFTRAST_TEXTURE_FLAG_* / format bits as passed to DPSOFTRAST_Texture_New
        int width; // dimensions of mip level 0
        int height;
        int depth;
        int sides; // 6 for cubemaps, 1 otherwise
        DPSOFTRAST_TEXTURE_FILTER filter;
        int mipmaps; // number of valid entries in mipmap[]
        int size; // total size of the bytes allocation (all mip levels)
        ATOMIC_COUNTER binds; // non-zero makes DPSOFTRAST_Texture_Free flush before releasing the pixels
        unsigned char *bytes; // pixel storage for all mip levels, NULL when the slot is free
        int mipmap[DPSOFTRAST_MAXMIPMAPS][5]; // per level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
}
DPSOFTRAST_Texture;
143
// commands use the same alignment as everything else declared with ALIGN()
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Common header of every command; the structs generated by DEFCOMMAND
// begin with these same two members.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode; // a DPSOFTRAST_OPCODE_* value
        unsigned short commandsize;
}
DPSOFTRAST_Command);
153
enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares one command type:
//  - the constant DPSOFTRAST_OPCODE_<name> with value opcodeval
//  - the aligned struct typedef DPSOFTRAST_Command_<name>, consisting of the
//    opcode/commandsize header followed by the given fields
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );
164
// size in bytes of the command storage array below
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound for a single command's size - confirm at the allocation site
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// Byte storage for queued commands plus its bookkeeping counters.
// NOTE(review): freecommand/usedcommands are maintained by code outside this
// section; their exact meaning should be confirmed there.
typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
{
        int freecommand;
        int usedcommands;
        ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
175
// Per-triangle interpolation setup.
// For each attribute array, attribs[array] holds three 4-component rows as
// consumed by DPSOFTRAST_CALCATTRIB: [0] is the per-x slope, [1] the per-y
// slope and [2] the base value, i.e. value = [2] + x*[0] + y*[1].
typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
{
        unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        float w[3];
        ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
183
// Evaluate one attribute array of a triangle at the start of a span.
//   triangle   - pointer to a DPSOFTRAST_State_Triangle
//   span       - pointer to a DPSOFTRAST_State_Span (only x and y are read)
//   data       - receives attribs[arrayindex][2] + x*attribs[..][0] + y*attribs[..][1]
//   slope      - receives attribs[arrayindex][0] (the per-x step)
//   arrayindex - DPSOFTRAST_ARRAY_* index
// SSE variant: data and slope are __m128 lvalues; attribs rows are loaded
// with _mm_load_ps and so must be 16 byte aligned.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar variant of the above: data and slope are float[4] (or float *).
// All macro arguments are parenthesized so expressions such as &span or
// array + offset can be passed safely.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        (slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
        (slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
        (slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
        (slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
        (data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
        (data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
        (data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
        (data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
200                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels produced for a triangle; stored in the
// per-thread spans[] array of DPSOFTRAST_State_Thread.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle; // triangle this span was generated by
        int x; // framebuffer x coord
        int y; // framebuffer y coord
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
        int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
        int depthslope; // depthbuffer value pixel delta
}
DPSOFTRAST_State_Span);
215
// capacities of the per-thread spans[], triangles[] and pixelmaskarray[] buffers
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// Bits of DPSOFTRAST_State_Thread.validate; a set bit means the matching
// group of derived fb_* values is stale and is recomputed by
// DPSOFTRAST_Validate.
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
224
// Blend modes selectable through fb_blendmode. DPSOFTRAST_RecalcBlendFunc
// maps the GL source/destination factor pair (and the subtract flag) onto
// one of these; unrecognized combinations map to OPAQUE.
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE,
        DPSOFTRAST_BLENDMODE_ALPHA,
        DPSOFTRAST_BLENDMODE_ADDALPHA,
        DPSOFTRAST_BLENDMODE_ADD,
        DPSOFTRAST_BLENDMODE_INVMOD,
        DPSOFTRAST_BLENDMODE_MUL,
        DPSOFTRAST_BLENDMODE_MUL2,
        DPSOFTRAST_BLENDMODE_SUBALPHA,
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
        DPSOFTRAST_BLENDMODE_INVADD,
        DPSOFTRAST_BLENDMODE_TOTAL
}
DPSOFTRAST_BLENDMODE;
240
// State owned by one rasterizer thread (element of dpsoftrast.threads):
// its copy of the render state, values derived from that state, the rows
// of the framebuffer it is responsible for, and scratch buffers for the
// spans and triangles being drawn.
typedef ALIGN(struct DPSOFTRAST_State_Thread_s
{
        void *thread;
        int index; // position of this thread among dpsoftrast.numthreads; used to compute the row bands below
        
        // render state (source values for the derived fb_* fields)
        int cullface;
        int colormask[4];
        int blendfunc[2]; // [0] source factor, [1] destination factor (GL enums)
        int blendsubtract;
        int depthmask;
        int depthtest;
        int depthfunc;
        int scissortest;
        int viewport[4]; // x, y, width, height
        int scissor[4]; // x, y, width, height
        float depthrange[2];
        float polygonoffset[2];
        float clipplane[4];
        ALIGN(float fb_clipplane[4]); // clipplane rescaled by DPSOFTRAST_RecalcClipPlane

        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // pointers into dpsoftrast.texture (rebased by DPSOFTRAST_Texture_Grow)
        
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // DPSOFTRAST_VALIDATE_ flags
        int validate;

        // derived values (DPSOFTRAST_VALIDATE_FB)
        int fb_colormask;
        int fb_scissor[4]; // x, y, width, height in framebuffer rows (y flipped), clamped to the framebuffer
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
        int fb_depthfunc;

        // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
        int fb_blendmode;

        // band boundaries
        // (two row ranges when dpsoftrast.interlace is set, otherwise both ranges are identical)
        int miny1;
        int maxy1;
        int miny2;
        int maxy2;

        ATOMIC(volatile int commandoffset);

        volatile bool waiting;
        volatile bool starving;
        void *waitcond;
        void *drawcond;
        void *drawmutex;

        int numspans;
        int numtriangles;
        DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
        DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
        unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
}
DPSOFTRAST_State_Thread);
306
// Global rasterizer state: framebuffer description, vertex array bindings,
// the texture table, the worker threads and the shared command pool.
typedef ALIGN(struct DPSOFTRAST_State_s
{
        int fb_width;
        int fb_height;
        unsigned int *fb_depthpixels;
        unsigned int *fb_colorpixels[4];

        int viewport[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        float color[4];
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // vertex array bindings
        const float *pointer_vertex3f;
        const float *pointer_color4f;
        const unsigned char *pointer_color4ub;
        const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int stride_vertex;
        int stride_color;
        int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // pointers into texture[] (rebased by DPSOFTRAST_Texture_Grow)

        int firstvertex;
        int numvertices;
        float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
        float *screencoord4f;
        int drawstarty;
        int drawendy;
        int drawclipped;
        
        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        // texture table
        int texture_max; // allocated number of slots in texture[]
        int texture_end; // one past the highest slot index in use
        int texture_firstfree; // index where DPSOFTRAST_Texture_New starts searching for a free slot
        DPSOFTRAST_Texture *texture;

        int bigendian;

        // error reporting
        const char *errorstring;

        bool usethreads;
        int interlace; // non-zero gives every thread two row bands instead of one (see DPSOFTRAST_RecalcThread)
        int numthreads;
        DPSOFTRAST_State_Thread *threads;

        ATOMIC(volatile int drawcommand);

        DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

// the single global rasterizer instance
DPSOFTRAST_State dpsoftrast;
366
// depth buffer values are (1-depth) scaled by 2^30
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Pack four float channels (scaled by 255 and rounded) into one int laid out
// as alpha<<24 | red<<16 | green<<8 | blue.
// Arguments are parenthesized so expressions can be passed; the alpha shift
// is done on an unsigned value because 255<<24 does not fit in a signed int,
// then converted back so the macro keeps its original int type.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | (int)((unsigned int)(int)((a) * 255.0f + 0.5f) << 24))
// convert a float depth d to the integer depth buffer representation
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
371
372 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
373 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
374
375 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
376 {
377         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
378         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
379         fb_viewportcenter[3] = 0.5f;
380         fb_viewportcenter[0] = 0.0f;
381         fb_viewportscale[1] = 0.5f * viewport[2];
382         fb_viewportscale[2] = -0.5f * viewport[3];
383         fb_viewportscale[3] = 0.5f;
384         fb_viewportscale[0] = 1.0f;
385 }
386
387 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
388 {
389         if (dpsoftrast.interlace)
390         {
391                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
393                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
394                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
395         }
396         else
397         {
398                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
399                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
400         }
401 }
402
403 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
404 {
405         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
406         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
407         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
408         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
409         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
410 }
411
412 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
413 {
414         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
415         // and viewport projection values
416         int x1, x2;
417         int y1, y2;
418         x1 = thread->scissor[0];
419         x2 = thread->scissor[0] + thread->scissor[2];
420         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
421         y2 = dpsoftrast.fb_height - thread->scissor[1];
422         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
423         if (x1 < 0) x1 = 0;
424         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
425         if (y1 < 0) y1 = 0;
426         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
427         thread->fb_scissor[0] = x1;
428         thread->fb_scissor[1] = y1;
429         thread->fb_scissor[2] = x2 - x1;
430         thread->fb_scissor[3] = y2 - y1;
431
432         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
433         DPSOFTRAST_RecalcClipPlane(thread);
434         DPSOFTRAST_RecalcThread(thread);
435 }
436
437 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
438 {
439         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
440 }
441
442 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
443 {
444         if (thread->blendsubtract)
445         {
446                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
447                 {
448                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
449                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
450                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
451                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
452                 }
453         }
454         else
455         {       
456                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
457                 {
458                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
459                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
460                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
461                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
462                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
463                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
464                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
465                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
466                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
467                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
468                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
469                 }
470         }
471 }
472
473 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
474
475 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
476 {
477         mask &= thread->validate;
478         if (!mask)
479                 return;
480         if (mask & DPSOFTRAST_VALIDATE_FB)
481         {
482                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
483                 DPSOFTRAST_RecalcFB(thread);
484         }
485         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
486         {
487                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
488                 DPSOFTRAST_RecalcDepthFunc(thread);
489         }
490         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
491         {
492                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
493                 DPSOFTRAST_RecalcBlendFunc(thread);
494         }
495 }
496
497 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
498 {
499         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
500                 return &dpsoftrast.texture[index];
501         return NULL;
502 }
503
504 static void DPSOFTRAST_Texture_Grow(void)
505 {
506         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
507         DPSOFTRAST_State_Thread *thread;
508         int i;
509         int j;
510         DPSOFTRAST_Flush();
511         // expand texture array as needed
512         if (dpsoftrast.texture_max < 1024)
513                 dpsoftrast.texture_max = 1024;
514         else
515                 dpsoftrast.texture_max *= 2;
516         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
517         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
518                 if (dpsoftrast.texbound[i])
519                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
520         for (j = 0; j < dpsoftrast.numthreads; j++)
521         {
522                 thread = &dpsoftrast.threads[j];
523                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
524                         if (thread->texbound[i])
525                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
526         }
527 }
528
529 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
530 {
531         int w;
532         int h;
533         int d;
534         int size;
535         int s;
536         int texnum;
537         int mipmaps;
538         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
539         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
540         DPSOFTRAST_Texture *texture;
541         if (width*height*depth < 1)
542         {
543                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
544                 return 0;
545         }
546         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
547         {
548                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
549                 return 0;
550         }
551         switch(texformat)
552         {
553         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
554         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
555         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
556                 break;
557         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
558                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
559                 {
560                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
561                         return 0;
562                 }
563                 if (depth != 1)
564                 {
565                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
566                         return 0;
567                 }
568                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
569                 {
570                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
571                         return 0;
572                 }
573                 break;
574         }
575         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
576         {
577                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
578                 return 0;
579         }
580         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
581         {
582                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
583                 return 0;
584         }
585         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
586         {
587                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
588                 return 0;
589         }
590         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
591         {
592                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
593                 return 0;
594         }
595         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
596         {
597                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
598                 return 0;
599         }
600         // find first empty slot in texture array
601         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
602                 if (!dpsoftrast.texture[texnum].bytes)
603                         break;
604         dpsoftrast.texture_firstfree = texnum + 1;
605         if (dpsoftrast.texture_max <= texnum)
606                 DPSOFTRAST_Texture_Grow();
607         if (dpsoftrast.texture_end <= texnum)
608                 dpsoftrast.texture_end = texnum + 1;
609         texture = &dpsoftrast.texture[texnum];
610         memset(texture, 0, sizeof(*texture));
611         texture->flags = flags;
612         texture->width = width;
613         texture->height = height;
614         texture->depth = depth;
615         texture->sides = sides;
616         texture->binds = 0;
617         w = width;
618         h = height;
619         d = depth;
620         size = 0;
621         mipmaps = 0;
622         w = width;
623         h = height;
624         d = depth;
625         for (;;)
626         {
627                 s = w * h * d * sides * 4;
628                 texture->mipmap[mipmaps][0] = size;
629                 texture->mipmap[mipmaps][1] = s;
630                 texture->mipmap[mipmaps][2] = w;
631                 texture->mipmap[mipmaps][3] = h;
632                 texture->mipmap[mipmaps][4] = d;
633                 size += s;
634                 mipmaps++;
635                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
636                         break;
637                 if (w > 1) w >>= 1;
638                 if (h > 1) h >>= 1;
639                 if (d > 1) d >>= 1;
640         }
641         texture->mipmaps = mipmaps;
642         texture->size = size;
643
644         // allocate the pixels now
645         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
646
647         return texnum;
648 }
649 void DPSOFTRAST_Texture_Free(int index)
650 {
651         DPSOFTRAST_Texture *texture;
652         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
653         if (texture->binds)
654                 DPSOFTRAST_Flush();
655         if (texture->bytes)
656                 MM_FREE(texture->bytes);
657         texture->bytes = NULL;
658         memset(texture, 0, sizeof(*texture));
659         // adjust the free range and used range
660         if (dpsoftrast.texture_firstfree > index)
661                 dpsoftrast.texture_firstfree = index;
662         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
663                 dpsoftrast.texture_end--;
664 }
// Regenerate mip levels 1..mipmaps-1 of a texture by box-filtering each
// level from the one above it. Does nothing for invalid indices or for
// textures with a single level.
// Texels are handled as 4 bytes each; every output texel averages the
// parent texels it covers (2x2x2, 2x2, or fewer when a parent dimension
// is already 1), with rounding.
void DPSOFTRAST_Texture_CalculateMipmaps(int index)
{
        int i, x, y, z, w, layer0, layer1, row0, row1;
        unsigned char *o, *i0, *i1, *i2, *i3;
        DPSOFTRAST_Texture *texture;
        texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
        if (texture->mipmaps <= 1)
                return;
        for (i = 1;i < texture->mipmaps;i++)
        {
                for (z = 0;z < texture->mipmap[i][4];z++)
                {
                        // the two parent layers feeding this layer (clamped when the parent has only one)
                        layer0 = z*2;
                        layer1 = z*2+1;
                        if (layer1 >= texture->mipmap[i-1][4])
                                layer1 = texture->mipmap[i-1][4]-1;
                        for (y = 0;y < texture->mipmap[i][3];y++)
                        {
                                // the two parent rows feeding this row (clamped when the parent has only one)
                                row0 = y*2;
                                row1 = y*2+1;
                                if (row1 >= texture->mipmap[i-1][3])
                                        row1 = texture->mipmap[i-1][3]-1;
                                // o: start of the output row; i0..i3: starts of the parent rows
                                // (layer0/row0, layer0/row1, layer1/row0, layer1/row1)
                                o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
                                i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
                                i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
                                i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
                                i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
                                w = texture->mipmap[i][2];
                                if (layer1 > layer0)
                                {
                                        if (texture->mipmap[i-1][2] > 1)
                                        {
                                                // average 3D texture
                                                for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
                                                {
                                                        o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
                                                        o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
                                                        o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
                                                        o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
                                                }
                                        }
                                        else
                                        {
                                                // average 3D mipmap with parent width == 1
                                                // NOTE(review): i2/i3 are not advanced here; w appears to always be 1 in this branch, so the loop body runs once - confirm
                                                for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
                                                {
                                                        o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
                                                        o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
                                                        o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
                                                        o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
                                                }
                                        }
                                }
                                else
                                {
                                        if (texture->mipmap[i-1][2] > 1)
                                        {
                                                // average 2D texture (common case)
                                                for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
                                                {
                                                        o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
                                                        o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
                                                        o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
                                                        o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
                                                }
                                        }
                                        else
                                        {
                                                // 2D texture with parent width == 1
                                                o[0] = (i0[0] + i1[0] + 1) >> 1;
                                                o[1] = (i0[1] + i1[1] + 1) >> 1;
                                                o[2] = (i0[2] + i1[2] + 1) >> 1;
                                                o[3] = (i0[3] + i1[3] + 1) >> 1;
                                        }
                                }
                        }
                }
        }
}
744 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
745 {
746         DPSOFTRAST_Texture *texture;
747         unsigned char *dst;
748         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
749         if (texture->binds)
750                 DPSOFTRAST_Flush();
751         if (pixels)
752         {
753                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
754                 while (blockheight > 0)
755                 {
756                         memcpy(dst, pixels, blockwidth * 4);
757                         pixels += blockwidth * 4;
758                         dst += texture->mipmap[0][2] * 4;
759                         blockheight--;
760                 }
761         }
762         DPSOFTRAST_Texture_CalculateMipmaps(index);
763 }
764 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
765 {
766         DPSOFTRAST_Texture *texture;
767         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
768         if (texture->binds)
769                 DPSOFTRAST_Flush();
770         if (pixels)
771                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
772         DPSOFTRAST_Texture_CalculateMipmaps(index);
773 }
774 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
775 {
776         DPSOFTRAST_Texture *texture;
777         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
778         return texture->mipmap[mip][2];
779 }
780 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
781 {
782         DPSOFTRAST_Texture *texture;
783         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
784         return texture->mipmap[mip][3];
785 }
786 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
787 {
788         DPSOFTRAST_Texture *texture;
789         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
790         return texture->mipmap[mip][4];
791 }
792 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
793 {
794         DPSOFTRAST_Texture *texture;
795         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
796         if (texture->binds)
797                 DPSOFTRAST_Flush();
798         return texture->bytes + texture->mipmap[mip][0];
799 }
800 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
801 {
802         DPSOFTRAST_Texture *texture;
803         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
804         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
805         {
806                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
807                 return;
808         }
809         if (texture->binds)
810                 DPSOFTRAST_Flush();
811         texture->filter = filter;
812 }
813
814 static void DPSOFTRAST_Draw_FlushThreads(void);
815
816 static void DPSOFTRAST_Draw_SyncCommands(void)
817 {
818         if(dpsoftrast.usethreads) MEMORY_BARRIER;
819         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
820 }
821
822 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
823 {
824         DPSOFTRAST_State_Thread *thread;
825         int i;
826         int freecommand = dpsoftrast.commandpool.freecommand;
827         int usedcommands = dpsoftrast.commandpool.usedcommands;
828         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
829                 return;
830         DPSOFTRAST_Draw_SyncCommands();
831         for(;;)
832         {
833                 int waitindex = -1;
834                 int commandoffset;
835                 usedcommands = 0;
836                 for (i = 0; i < dpsoftrast.numthreads; i++)
837                 {
838                         thread = &dpsoftrast.threads[i]; 
839                         commandoffset = freecommand - thread->commandoffset;
840                         if (commandoffset < 0)
841                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
842                         if (commandoffset > usedcommands)
843                         {
844                                 waitindex = i;
845                                 usedcommands = commandoffset;
846                         }
847                 }
848                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
849                         break;
850                 thread = &dpsoftrast.threads[waitindex];
851                 Thread_LockMutex(thread->drawmutex);
852                 if (thread->commandoffset != dpsoftrast.drawcommand)
853                 {
854                         thread->waiting = true;
855                         if (thread->starving) Thread_CondSignal(thread->drawcond);
856                         Thread_CondWait(thread->waitcond, thread->drawmutex);
857                         thread->waiting = false;
858                 }
859                 Thread_UnlockMutex(thread->drawmutex);
860         }
861         dpsoftrast.commandpool.usedcommands = usedcommands;
862 }
863
// Rounds `size` up to the next multiple of COMMAND_SIZE by adding the padding
// (-size modulo COMMAND_SIZE).
// NOTE(review): like the mask arithmetic it replaces, this assumes
// COMMAND_SIZE is a power of two - confirm at its definition.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	((size) + ((0 - (size)) & (COMMAND_SIZE-1)))
// Allocates a command record of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> from the command pool, sized to the aligned struct size.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name, \
		DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
868
869 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
870 {
871         DPSOFTRAST_Command *command;
872         int freecommand = dpsoftrast.commandpool.freecommand;
873         int usedcommands = dpsoftrast.commandpool.usedcommands;
874         int extra = sizeof(DPSOFTRAST_Command);
875         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
876                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
877         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
878         {
879                 if (dpsoftrast.usethreads)
880                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
881                 else
882                         DPSOFTRAST_Draw_FlushThreads();
883                 freecommand = dpsoftrast.commandpool.freecommand;
884                 usedcommands = dpsoftrast.commandpool.usedcommands;
885         }
886         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
887         {
888                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
889                 command->opcode = DPSOFTRAST_OPCODE_Reset;
890                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
891                 freecommand = 0;
892         }
893         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
894         command->opcode = opcode;
895         command->commandsize = size;
896         freecommand += size;
897         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
898                 freecommand = 0;
899         dpsoftrast.commandpool.freecommand = freecommand;
900         dpsoftrast.commandpool.usedcommands = usedcommands + size;
901         return command;
902 }
903
904 static void DPSOFTRAST_UndoCommand(int size)
905 {
906         int freecommand = dpsoftrast.commandpool.freecommand;
907         int usedcommands = dpsoftrast.commandpool.usedcommands;
908         freecommand -= size;
909         if (freecommand < 0)
910                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
911         usedcommands -= size;
912         dpsoftrast.commandpool.freecommand = freecommand;
913         dpsoftrast.commandpool.usedcommands = usedcommands;
914 }
915                 
916 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
917 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
918 {
919         thread->viewport[0] = command->x;
920         thread->viewport[1] = command->y;
921         thread->viewport[2] = command->width;
922         thread->viewport[3] = command->height;
923         thread->validate |= DPSOFTRAST_VALIDATE_FB;
924 }
925 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
926 {
927         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
928         command->x = x;
929         command->y = y;
930         command->width = width;
931         command->height = height;
932
933         dpsoftrast.viewport[0] = x;
934         dpsoftrast.viewport[1] = y;
935         dpsoftrast.viewport[2] = width;
936         dpsoftrast.viewport[3] = height;
937         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
938 }
939
940 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
941 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
942 {
943         int i, x1, y1, x2, y2, w, h, x, y;
944         int miny1, maxy1, miny2, maxy2;
945         int bandy;
946         unsigned int *p;
947         unsigned int c;
948         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
949         miny1 = thread->miny1;
950         maxy1 = thread->maxy1;
951         miny2 = thread->miny2;
952         maxy2 = thread->maxy2;
953         x1 = thread->fb_scissor[0];
954         y1 = thread->fb_scissor[1];
955         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
956         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
957         if (y1 < miny1) y1 = miny1;
958         if (y2 > maxy2) y2 = maxy2;
959         w = x2 - x1;
960         h = y2 - y1;
961         if (w < 1 || h < 1)
962                 return;
963         // FIXME: honor fb_colormask?
964         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
965         for (i = 0;i < 4;i++)
966         {
967                 if (!dpsoftrast.fb_colorpixels[i])
968                         continue;
969                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
970                 for (;y < bandy;y++)
971                 {
972                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
973                         for (x = x1;x < x2;x++)
974                                 p[x] = c;
975                 }
976         }
977 }
978 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
979 {
980         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
981         command->r = r;
982         command->g = g;
983         command->b = b;
984         command->a = a;
985 }
986
987 DEFCOMMAND(3, ClearDepth, float depth;)
988 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
989 {
990         int x1, y1, x2, y2, w, h, x, y;
991         int miny1, maxy1, miny2, maxy2;
992         int bandy;
993         unsigned int *p;
994         unsigned int c;
995         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
996         miny1 = thread->miny1;
997         maxy1 = thread->maxy1;
998         miny2 = thread->miny2;
999         maxy2 = thread->maxy2;
1000         x1 = thread->fb_scissor[0];
1001         y1 = thread->fb_scissor[1];
1002         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1003         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
1004         if (y1 < miny1) y1 = miny1;
1005         if (y2 > maxy2) y2 = maxy2;
1006         w = x2 - x1;
1007         h = y2 - y1;
1008         if (w < 1 || h < 1)
1009                 return;
1010         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
1011         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1012         for (;y < bandy;y++)
1013         {
1014                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1015                 for (x = x1;x < x2;x++)
1016                         p[x] = c;
1017         }
1018 }
1019 void DPSOFTRAST_ClearDepth(float d)
1020 {
1021         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1022         command->depth = d;
1023 }
1024
1025 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1026 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1027 {
1028         thread->colormask[0] = command->r != 0;
1029         thread->colormask[1] = command->g != 0;
1030         thread->colormask[2] = command->b != 0;
1031         thread->colormask[3] = command->a != 0;
1032         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1033 }
1034 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1035 {
1036         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1037         command->r = r;
1038         command->g = g;
1039         command->b = b;
1040         command->a = a;
1041 }
1042
1043 DEFCOMMAND(5, DepthTest, int enable;)
1044 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1045 {
1046         thread->depthtest = command->enable;
1047         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1048 }
1049 void DPSOFTRAST_DepthTest(int enable)
1050 {
1051         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1052         command->enable = enable;
1053 }
1054
1055 DEFCOMMAND(6, ScissorTest, int enable;)
1056 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1057 {
1058         thread->scissortest = command->enable;
1059         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1060 }
1061 void DPSOFTRAST_ScissorTest(int enable)
1062 {
1063         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1064         command->enable = enable;
1065 }
1066
1067 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1068 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1069 {
1070         thread->scissor[0] = command->x;
1071         thread->scissor[1] = command->y;
1072         thread->scissor[2] = command->width;
1073         thread->scissor[3] = command->height;
1074         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1075 }
1076 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1077 {
1078         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1079         command->x = x;
1080         command->y = y;
1081         command->width = width;
1082         command->height = height;
1083 }
1084
1085 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1086 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1087 {
1088         thread->blendfunc[0] = command->sfactor;
1089         thread->blendfunc[1] = command->dfactor;
1090         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1091 }
1092 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1093 {
1094         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1095         command->sfactor = sfactor;
1096         command->dfactor = dfactor;
1097 }
1098
1099 DEFCOMMAND(9, BlendSubtract, int enable;)
1100 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1101 {
1102         thread->blendsubtract = command->enable;
1103         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1104 }
1105 void DPSOFTRAST_BlendSubtract(int enable)
1106 {
1107         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1108         command->enable = enable;
1109 }
1110
1111 DEFCOMMAND(10, DepthMask, int enable;)
1112 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1113 {
1114         thread->depthmask = command->enable;
1115 }
1116 void DPSOFTRAST_DepthMask(int enable)
1117 {
1118         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1119         command->enable = enable;
1120 }
1121
1122 DEFCOMMAND(11, DepthFunc, int func;)
1123 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1124 {
1125         thread->depthfunc = command->func;
1126 }
1127 void DPSOFTRAST_DepthFunc(int func)
1128 {
1129         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1130         command->func = func;
1131 }
1132
1133 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1134 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1135 {
1136         thread->depthrange[0] = command->nearval;
1137         thread->depthrange[1] = command->farval;
1138 }
1139 void DPSOFTRAST_DepthRange(float nearval, float farval)
1140 {
1141         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1142         command->nearval = nearval;
1143         command->farval = farval;
1144 }
1145
1146 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1147 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1148 {
1149         thread->polygonoffset[0] = command->alongnormal;
1150         thread->polygonoffset[1] = command->intoview;
1151 }
1152 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1153 {
1154         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1155         command->alongnormal = alongnormal;
1156         command->intoview = intoview;
1157 }
1158
1159 DEFCOMMAND(14, CullFace, int mode;)
1160 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1161 {
1162         thread->cullface = command->mode;
1163 }
1164 void DPSOFTRAST_CullFace(int mode)
1165 {
1166         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1167         command->mode = mode;
1168 }
1169
1170 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1171 {
1172         dpsoftrast.color[0] = r;
1173         dpsoftrast.color[1] = g;
1174         dpsoftrast.color[2] = b;
1175         dpsoftrast.color[3] = a;
1176 }
1177
1178 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1179 {
1180         int outstride = blockwidth * 4;
1181         int instride = dpsoftrast.fb_width * 4;
1182         int bx1 = blockx;
1183         int by1 = blocky;
1184         int bx2 = blockx + blockwidth;
1185         int by2 = blocky + blockheight;
1186         int bw;
1187         int x;
1188         int y;
1189         unsigned char *inpixels;
1190         unsigned char *b;
1191         unsigned char *o;
1192         DPSOFTRAST_Flush();
1193         if (bx1 < 0) bx1 = 0;
1194         if (by1 < 0) by1 = 0;
1195         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1196         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1197         bw = bx2 - bx1;
1198         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1199         if (dpsoftrast.bigendian)
1200         {
1201                 for (y = by1;y < by2;y++)
1202                 {
1203                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1204                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1205                         for (x = bx1;x < bx2;x++)
1206                         {
1207                                 o[0] = b[3];
1208                                 o[1] = b[2];
1209                                 o[2] = b[1];
1210                                 o[3] = b[0];
1211                                 o += 4;
1212                                 b += 4;
1213                         }
1214                 }
1215         }
1216         else
1217         {
1218                 for (y = by1;y < by2;y++)
1219                 {
1220                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1221                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1222                         memcpy(o, b, bw*4);
1223                 }
1224         }
1225
1226 }
1227 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1228 {
1229         int tx1 = tx;
1230         int ty1 = ty;
1231         int tx2 = tx + width;
1232         int ty2 = ty + height;
1233         int sx1 = sx;
1234         int sy1 = sy;
1235         int sx2 = sx + width;
1236         int sy2 = sy + height;
1237         int swidth;
1238         int sheight;
1239         int twidth;
1240         int theight;
1241         int sw;
1242         int sh;
1243         int tw;
1244         int th;
1245         int y;
1246         unsigned int *spixels;
1247         unsigned int *tpixels;
1248         DPSOFTRAST_Texture *texture;
1249         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1250         if (mip < 0 || mip >= texture->mipmaps) return;
1251         DPSOFTRAST_Flush();
1252         spixels = dpsoftrast.fb_colorpixels[0];
1253         swidth = dpsoftrast.fb_width;
1254         sheight = dpsoftrast.fb_height;
1255         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1256         twidth = texture->mipmap[mip][2];
1257         theight = texture->mipmap[mip][3];
1258         if (tx1 < 0) tx1 = 0;
1259         if (ty1 < 0) ty1 = 0;
1260         if (tx2 > twidth) tx2 = twidth;
1261         if (ty2 > theight) ty2 = theight;
1262         if (sx1 < 0) sx1 = 0;
1263         if (sy1 < 0) sy1 = 0;
1264         if (sx2 > swidth) sx2 = swidth;
1265         if (sy2 > sheight) sy2 = sheight;
1266         tw = tx2 - tx1;
1267         th = ty2 - ty1;
1268         sw = sx2 - sx1;
1269         sh = sy2 - sy1;
1270         if (tw > sw) tw = sw;
1271         if (th > sh) th = sh;
1272         if (tw < 1 || th < 1)
1273                 return;
1274         sy1 = sheight - 1 - sy1;
1275         for (y = 0;y < th;y++)
1276                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1277         if (texture->mipmaps > 1)
1278                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1279 }
1280
1281 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1282 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1283 {
1284         if (thread->texbound[command->unitnum])
1285                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1286         thread->texbound[command->unitnum] = command->texture;
1287 }
1288 void DPSOFTRAST_SetTexture(int unitnum, int index)
1289 {
1290         DPSOFTRAST_Command_SetTexture *command;
1291         DPSOFTRAST_Texture *texture;
1292         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1293         {
1294                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1295                 return;
1296         }
1297         texture = DPSOFTRAST_Texture_GetByIndex(index);
1298         if (index && !texture)
1299         {
1300                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1301                 return;
1302         }
1303
1304         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1305         command->unitnum = unitnum;
1306         command->texture = texture;
1307
1308         dpsoftrast.texbound[unitnum] = texture;
1309         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1310 }
1311
1312 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1313 {
1314         dpsoftrast.pointer_vertex3f = vertex3f;
1315         dpsoftrast.stride_vertex = stride;
1316 }
1317 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1318 {
1319         dpsoftrast.pointer_color4f = color4f;
1320         dpsoftrast.pointer_color4ub = NULL;
1321         dpsoftrast.stride_color = stride;
1322 }
1323 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1324 {
1325         dpsoftrast.pointer_color4f = NULL;
1326         dpsoftrast.pointer_color4ub = color4ub;
1327         dpsoftrast.stride_color = stride;
1328 }
1329 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1330 {
1331         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1332         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1333         dpsoftrast.stride_texcoord[unitnum] = stride;
1334 }
1335
1336 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1337 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1338 {
1339         thread->shader_mode = command->mode;
1340         thread->shader_permutation = command->permutation;
1341         thread->shader_exactspecularmath = command->exactspecularmath;
1342 }
1343 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1344 {
1345         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1346         command->mode = mode;
1347         command->permutation = permutation;
1348         command->exactspecularmath = exactspecularmath;
1349
1350         dpsoftrast.shader_mode = mode;
1351         dpsoftrast.shader_permutation = permutation;
1352         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1353 }
1354
1355 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1356 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1357 {
1358         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1359 }
1360 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1361 {
1362         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1363         command->index = index;
1364         command->val[0] = v0;
1365         command->val[1] = v1;
1366         command->val[2] = v2;
1367         command->val[3] = v3;
1368
1369         dpsoftrast.uniform4f[index*4+0] = v0;
1370         dpsoftrast.uniform4f[index*4+1] = v1;
1371         dpsoftrast.uniform4f[index*4+2] = v2;
1372         dpsoftrast.uniform4f[index*4+3] = v3;
1373 }
1374 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1375 {
1376         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1377         command->index = index;
1378         memcpy(command->val, v, sizeof(command->val));
1379
1380         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1381 }
1382
1383 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1384 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1385 {
1386         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1387 }
1388 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1389 {
1390 #ifdef SSE_POSSIBLE
1391         int i, index;
1392         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1393         {
1394                 __m128 m0, m1, m2, m3;
1395                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1396                 command->index = (DPSOFTRAST_UNIFORM)index;
1397                 if (((size_t)v)&(ALIGN_SIZE-1))
1398                 {
1399                         m0 = _mm_loadu_ps(v);
1400                         m1 = _mm_loadu_ps(v+4);
1401                         m2 = _mm_loadu_ps(v+8);
1402                         m3 = _mm_loadu_ps(v+12);
1403                 }
1404                 else
1405                 {
1406                         m0 = _mm_load_ps(v);
1407                         m1 = _mm_load_ps(v+4);
1408                         m2 = _mm_load_ps(v+8);
1409                         m3 = _mm_load_ps(v+12);
1410                 }
1411                 if (transpose)
1412                 {
1413                         __m128 t0, t1, t2, t3;
1414                         t0 = _mm_unpacklo_ps(m0, m1);
1415                         t1 = _mm_unpacklo_ps(m2, m3);
1416                         t2 = _mm_unpackhi_ps(m0, m1);
1417                         t3 = _mm_unpackhi_ps(m2, m3);
1418                         m0 = _mm_movelh_ps(t0, t1);
1419                         m1 = _mm_movehl_ps(t1, t0);
1420                         m2 = _mm_movelh_ps(t2, t3);
1421                         m3 = _mm_movehl_ps(t3, t2);                     
1422                 }
1423                 _mm_store_ps(command->val, m0);
1424                 _mm_store_ps(command->val+4, m1);
1425                 _mm_store_ps(command->val+8, m2);
1426                 _mm_store_ps(command->val+12, m3);
1427                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1428                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1429                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1430                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1431         }
1432 #endif
1433 }
1434
1435 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1436 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1437 {
1438         thread->uniform1i[command->index] = command->val;
1439 }
1440 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1441 {
1442         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1443         command->index = index;
1444         command->val = i0;
1445
1446         dpsoftrast.uniform1i[command->index] = i0;
1447 }
1448
1449 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1450 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1451 {
1452         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1453         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1454 }
1455 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1456 {
1457         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1458         command->clipplane[0] = x;
1459         command->clipplane[1] = y;
1460         command->clipplane[2] = z;
1461         command->clipplane[3] = w;
1462 }
1463
1464 #ifdef SSE_POSSIBLE
1465 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1466 {
1467         float *end = dst + size*4;
1468         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1469         {
1470                 while (dst < end)
1471                 {
1472                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1473                         dst += 4;
1474                         src += stride;
1475                 }
1476         }
1477         else
1478         {
1479                 while (dst < end)
1480                 {
1481                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1482                         dst += 4;
1483                         src += stride;
1484                 }
1485         }
1486 }
1487
1488 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1489 {
1490         float *end = dst + size*4;
1491         if (stride == sizeof(float[3]))
1492         {
1493                 float *end4 = dst + (size&~3)*4;        
1494                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1495                 {
1496                         while (dst < end4)
1497                         {
1498                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1499                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1500                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1501                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1502                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1503                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1504                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1505                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1506                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1507                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1508                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1509                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1510                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1511                                 dst += 16;
1512                                 src += 4*sizeof(float[3]);
1513                         }
1514                 }
1515                 else
1516                 {
1517                         while (dst < end4)
1518                         {
1519                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1520                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1521                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1522                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1523                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1524                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1525                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1526                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1527                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1528                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1529                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1530                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1531                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1532                                 dst += 16;
1533                                 src += 4*sizeof(float[3]);
1534                         }
1535                 }
1536         }
1537         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1538         {
1539                 while (dst < end)
1540                 {
1541                         __m128 v = _mm_loadu_ps((const float *)src);
1542                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1543                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1544                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1545                         _mm_store_ps(dst, v);
1546                         dst += 4;
1547                         src += stride;
1548                 }
1549         }
1550         else
1551         {
1552                 while (dst < end)
1553                 {
1554                         __m128 v = _mm_load_ps((const float *)src);
1555                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1556                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1557                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1558                         _mm_store_ps(dst, v);
1559                         dst += 4;
1560                         src += stride;
1561                 }
1562         }
1563 }
1564
1565 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1566 {
1567         float *end = dst + size*4;
1568         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1569         if (stride == sizeof(float[2]))
1570         {
1571                 float *end2 = dst + (size&~1)*4;
1572                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1573                 {
1574                         while (dst < end2)
1575                         {
1576                                 __m128 v = _mm_loadu_ps((const float *)src);
1577                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1578                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1579                                 dst += 8;
1580                                 src += 2*sizeof(float[2]);
1581                         }
1582                 }
1583                 else
1584                 {
1585                         while (dst < end2)
1586                         {
1587                                 __m128 v = _mm_load_ps((const float *)src);
1588                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1589                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1590                                 dst += 8;
1591                                 src += 2*sizeof(float[2]);
1592                         }
1593                 }
1594         }
1595         while (dst < end)
1596         {
1597                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1598                 dst += 4;
1599                 src += stride;
1600         }
1601 }
1602
1603 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1604 {
1605         float *end = dst + size*4;
1606         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1607         if (stride == sizeof(unsigned char[4]))
1608         {
1609                 float *end4 = dst + (size&~3)*4;
1610                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1611                 {
1612                         while (dst < end4)
1613                         {
1614                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1615                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1616                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1617                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1618                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1619                                 dst += 16;
1620                                 src += 4*sizeof(unsigned char[4]);
1621                         }
1622                 }
1623                 else
1624                 {
1625                         while (dst < end4)
1626                         {
1627                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1628                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1629                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1630                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1631                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1632                                 dst += 16;
1633                                 src += 4*sizeof(unsigned char[4]);
1634                         }
1635                 }
1636         }
1637         while (dst < end)
1638         {
1639                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1640                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1641                 dst += 4;
1642                 src += stride;
1643         }
1644 }
1645
1646 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1647 {
1648         float *end = dst + 4*size;
1649         __m128 v = _mm_loadu_ps(src);
1650         while (dst < end)
1651         {
1652                 _mm_store_ps(dst, v);
1653                 dst += 4;
1654         }
1655 }
1656 #endif
1657
// Multiplies `numitems` four-float vectors by a 4x4 matrix:
//   out = in.x*M[0..3] + in.y*M[4..7] + in.z*M[8..11] + in.w*M[12..15]
// out4f may be the same array as in4f (each vector is fully loaded before it
// is stored). out4f is written with aligned stores; in4f and the matrix may
// be unaligned. When SSE_POSSIBLE is not defined this function does nothing.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 col0, col1, col2, col3;
	float *stop;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// identity matrix: a plain copy (or nothing at all when done in place) is enough
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	stop = out4f + numitems*4;
	col0 = _mm_loadu_ps(inmatrix16f);
	col1 = _mm_loadu_ps(inmatrix16f + 4);
	col2 = _mm_loadu_ps(inmatrix16f + 8);
	col3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_loadu_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), col0);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), col1);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), col2);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), col3);
			// summed innermost-first as x + (y + (z + w))
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
	else
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_load_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), col0);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), col1);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), col2);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), col3);
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
#endif
}
1705
// Copies `numitems` four-float vectors from in4f to out4f.
// Uses memcpy, so the two ranges must not overlap.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1710
1711 #ifdef SSE_POSSIBLE
// Perspective-divides a clip-space vertex and maps it through the viewport.
// With in = (x, y, z, w) the result stored in `out` is
//   (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0*1/w)
// where c = viewportcenter and s = viewportscale; i.e. element 0 of both
// viewport vectors belongs to the 1/w term and elements 1..3 to x, y, z.
// The expansion is a bare { } block that declares locals named p and w, so
// arguments must not be expressions referring to caller variables called p or w.
1712 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1713 { \
1714         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1715         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1716         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1717         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1718 }
1719
// The body is token-for-token the same as DPSOFTRAST_PROJECTVERTEX: with
// in = (x, y, z, w) it stores
//   (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0*1/w)
// in `out`, where c = viewportcenter and s = viewportscale.
// NOTE(review): despite the name, nothing here is specific to Y; no use of this macro is visible in this part of the file.
// Declares locals named p and w inside a bare { } block.
1720 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1721 { \
1722         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1723         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1724         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1725         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1726 }
1727
// Multiplies the vector `in` = (x, y, z, w) by a matrix given as four
// registers and stores the result in `out`:
//   out = x*m0 + (y*m1 + (z*m2 + w*m3))
// `in` is copied into a local named p first, so `out` and `in` may be the
// same variable. m0..m3 are expanded unparenthesized and should be plain
// variable names.
1728 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1729 { \
1730         __m128 p = (in); \
1731         out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1732                                                   _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1733                                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1734                                                                                         _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
1735 }
1736
// Computes the screen-space Y extent of an axis-aligned bounding box after
// transformation by inmatrix16f.
//   starty, endy - outputs: *starty is the truncated viewport-mapped value of the
//                  largest projected y/w, *endy is the truncated viewport-mapped
//                  value of the smallest projected y/w, plus one
//   minposf/maxposf - box corners as four floats each; read with aligned loads,
//                  so both must be 16-byte aligned
//   inmatrix16f  - 16 floats, read with unaligned loads
// Returns a bit mask with bit k set for every box corner k whose transformed
// z + w is negative (the corner failed the z + w >= 0 test below).
// If all eight corners fail, the projected range stays at its initial inverted
// value (min = 2, max = -2) before the viewport mapping.
1737 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1738 {
1739         int clipmask = 0xFF;
1740         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1741         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1742         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1743         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
        // swap elements 0 and 1 of every matrix register so that the transformed
        // y lands in element 0, where the scalar (_ss) operations below work
1744         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1745         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1746         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1747         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
        // BBFRONT(k, pos): transform corner k, record z + w in clipdist[k]; when
        // that is >= 0, clear bit k of clipmask and fold y/w into minproj/maxproj
1748         #define BBFRONT(k, pos) \
1749         { \
1750                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1751                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1752                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1753                 { \
1754                         __m128 proj; \
1755                         clipmask &= ~(1<<k); \
1756                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1757                         minproj = _mm_min_ss(minproj, proj); \
1758                         maxproj = _mm_max_ss(maxproj, proj); \
1759                 } \
1760         }
        // the eight corners: bit 0 of k selects max x, bit 1 max y, bit 2 max z/w
        // (elements 2 and 3 are always taken together from the same source)
1761         BBFRONT(0, minpos); 
1762         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1763         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1764         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1765         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1766         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1767         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1768         BBFRONT(7, maxpos);
        // BBCLIP(k): for a corner that failed the test, look at its three edge
        // neighbours (k^1, k^2, k^4). For each neighbour that passed, interpolate
        // along the edge to the point where z + w = 0 and fold that point's y/w
        // into minproj/maxproj. The divisor clipdist[k] - clipdist[neighbour] is
        // strictly negative because clipdist[k] < 0 <= clipdist[neighbour].
1769         #define BBCLIP(k) \
1770         { \
1771                 if (clipmask&(1<<k)) \
1772                 { \
1773                         if (!(clipmask&(1<<(k^1)))) \
1774                         { \
1775                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1776                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1777                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1778                                 minproj = _mm_min_ss(minproj, proj); \
1779                                 maxproj = _mm_max_ss(maxproj, proj); \
1780                         } \
1781                         if (!(clipmask&(1<<(k^2)))) \
1782                         { \
1783                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1784                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1785                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1786                                 minproj = _mm_min_ss(minproj, proj); \
1787                                 maxproj = _mm_max_ss(maxproj, proj); \
1788                         } \
1789                         if (!(clipmask&(1<<(k^4)))) \
1790                         { \
1791                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1792                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1793                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1794                                 minproj = _mm_min_ss(minproj, proj); \
1795                                 maxproj = _mm_max_ss(maxproj, proj); \
1796                         } \
1797                 } \
1798         }
1799         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
        // move element 2 of the viewport vectors into element 0 for the scalar
        // math below (element 2 is the one DPSOFTRAST_PROJECTVERTEX applies to y)
1800         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1801         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
        // clamp the projected range to [-2, 2] before mapping it through the viewport
1802         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1803         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1804         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1805         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
        // NOTE(review): starty comes from maxproj and endy from minproj, which presumably means the viewport Y scale is negative - confirm where fb_viewportscale is set
1806         *starty = _mm_cvttss_si32(maxproj);
1807         *endy = _mm_cvttss_si32(minproj)+1;
1808         return clipmask;
1809 }
1810         
1811 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1812 {
1813         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1814         float *end = out4f + numitems*4;
1815         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1816         __m128 minpos, maxpos;
1817         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1818         {
1819                 minpos = maxpos = _mm_loadu_ps(in4f);
1820                 while (out4f < end)
1821                 {
1822                         __m128 v = _mm_loadu_ps(in4f);
1823                         minpos = _mm_min_ps(minpos, v);
1824                         maxpos = _mm_max_ps(maxpos, v);
1825                         _mm_store_ps(out4f, v);
1826                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1827                         _mm_store_ps(screen4f, v);
1828                         in4f += 4;
1829                         out4f += 4;
1830                         screen4f += 4;
1831                 }
1832         }
1833         else
1834         {
1835                 minpos = maxpos = _mm_load_ps(in4f);
1836                 while (out4f < end)
1837                 {
1838                         __m128 v = _mm_load_ps(in4f);
1839                         minpos = _mm_min_ps(minpos, v);
1840                         maxpos = _mm_max_ps(maxpos, v);
1841                         _mm_store_ps(out4f, v);
1842                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1843                         _mm_store_ps(screen4f, v);
1844                         in4f += 4;
1845                         out4f += 4;
1846                         screen4f += 4;
1847                 }
1848         }
1849         if (starty && endy) 
1850         {
1851                 ALIGN(float minposf[4]);
1852                 ALIGN(float maxposf[4]);
1853                 _mm_store_ps(minposf, minpos);
1854                 _mm_store_ps(maxposf, maxpos);
1855                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1856         }
1857         return 0;
1858 }
1859
1860 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1861 {
1862         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1863         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1864         float *end;
1865         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1866                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1867         end = out4f + numitems*4;
1868         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1869         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1870         m0 = _mm_loadu_ps(inmatrix16f);
1871         m1 = _mm_loadu_ps(inmatrix16f + 4);
1872         m2 = _mm_loadu_ps(inmatrix16f + 8);
1873         m3 = _mm_loadu_ps(inmatrix16f + 12);
1874         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1875         {
1876                 minpos = maxpos = _mm_loadu_ps(in4f);
1877                 while (out4f < end)
1878                 {
1879                         __m128 v = _mm_loadu_ps(in4f);
1880                         minpos = _mm_min_ps(minpos, v);
1881                         maxpos = _mm_max_ps(maxpos, v);
1882                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1883                         _mm_store_ps(out4f, v);
1884                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1885                         _mm_store_ps(screen4f, v);
1886                         in4f += 4;
1887                         out4f += 4;
1888                         screen4f += 4;
1889                 }
1890         }
1891         else
1892         {
1893                 minpos = maxpos = _mm_load_ps(in4f);
1894                 while (out4f < end)
1895                 {
1896                         __m128 v = _mm_load_ps(in4f);
1897                         minpos = _mm_min_ps(minpos, v);
1898                         maxpos = _mm_max_ps(maxpos, v);
1899                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1900                         _mm_store_ps(out4f, v);
1901                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1902                         _mm_store_ps(screen4f, v);
1903                         in4f += 4;
1904                         out4f += 4;
1905                         screen4f += 4;
1906                 }
1907         }
1908         if (starty && endy) 
1909         {
1910                 ALIGN(float minposf[4]);
1911                 ALIGN(float maxposf[4]);
1912                 _mm_store_ps(minposf, minpos);
1913                 _mm_store_ps(maxposf, maxpos);
1914                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1915         }
1916         return 0;
1917 }
1918 #endif
1919
// Fills dpsoftrast.post_array4f[outarray] with four-float vectors for the
// current vertex range (dpsoftrast.firstvertex, dpsoftrast.numvertices), read
// from the source array selected by `inarray`, and returns that buffer.
//   DPSOFTRAST_ARRAY_POSITION - 3-float positions, expanded to (x, y, z, 1)
//   DPSOFTRAST_ARRAY_COLOR    - float colors if set, else byte colors if set,
//                               else the constant dpsoftrast.color for every vertex
//   anything else             - texcoord array (inarray - DPSOFTRAST_ARRAY_TEXCOORD0)
//                               with 2, 3 or 4 components; if its pointer is NULL
//                               or the component count is something else, the
//                               buffer is returned without being written
// Returns NULL when SSE_POSSIBLE is not defined.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float * const dest = dpsoftrast.post_array4f[outarray];
	const int first = dpsoftrast.firstvertex;
	const int count = dpsoftrast.numvertices;
	const unsigned char *bytes;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		bytes = (const unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
		DPSOFTRAST_Load3fTo4f(dest, bytes, count, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			bytes = (const unsigned char *)dpsoftrast.pointer_color4f + first * stride;
			DPSOFTRAST_Load4fTo4f(dest, bytes, count, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			bytes = (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride;
			DPSOFTRAST_Load4bTo4f(dest, bytes, count, stride);
		}
		else
		{
			// no color array bound: replicate the constant color
			DPSOFTRAST_Fill4f(dest, dpsoftrast.color, count);
		}
	}
	else
	{
		const int unit = inarray - DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			bytes = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(dest, bytes, count, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(dest, bytes, count, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(dest, bytes, count, stride);
				break;
			default:
				// other component counts load nothing
				break;
			}
		}
	}
	return dest;
#else
	return NULL;
#endif
}
1978
1979 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1980 {
1981         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1982         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1983         return data;
1984 }
1985
#if 0
// Currently compiled out by the surrounding #if 0.
// Projects the vectors of post array `outarray` to dpsoftrast.screencoord4f
// without transforming them, storing the Y bounds in dpsoftrast.drawstarty /
// drawendy and the clip mask in dpsoftrast.drawclipped. Returns the array,
// or NULL when SSE_POSSIBLE is not defined.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
1998
// Transforms the vectors of post array `outarray` in place by inmatrix16f and
// writes their projected form to dpsoftrast.screencoord4f. The Y bounds go to
// dpsoftrast.drawstarty / drawendy and the returned clip mask to
// dpsoftrast.drawclipped. When inarray >= 0 the array is first loaded from
// that source; a negative inarray reuses the existing contents.
// Returns the array, or NULL when SSE_POSSIBLE is not defined.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2009
2010 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2011 {
2012         int x;
2013         int startx = span->startx;
2014         int endx = span->endx;
2015         float wslope = triangle->w[0];
2016         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2017         float endz = 1.0f / (w + wslope * startx);
2018         if (triangle->w[0] == 0)
2019         {
2020                 // LordHavoc: fast flat polygons (HUD/menu)
2021                 for (x = startx;x < endx;x++)
2022                         zf[x] = endz;
2023                 return;
2024         }
2025         for (x = startx;x < endx;)
2026         {
2027                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2028                 float z = endz, dz;
2029                 if (nextsub >= endx) nextsub = endsub = endx-1;
2030                 endz = 1.0f / (w + wslope * nextsub);
2031                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2032                 for (; x <= endsub; x++, z += dz)
2033                         zf[x] = z;
2034         }
2035 }
2036
2037 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2038 {
2039 #ifdef SSE_POSSIBLE
2040         int x;
2041         int startx = span->startx;
2042         int endx = span->endx;
2043         int maskx;
2044         int subx;
2045         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2046         unsigned char * RESTRICT pixelmask = span->pixelmask;
2047         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2048         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2049         if (!pixel)
2050                 return;
2051         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2052         pixeli += span->y * dpsoftrast.fb_width + span->x;
2053         // handle alphatest now (this affects depth writes too)
2054         if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2055                 for (x = startx;x < endx;x++)
2056                         if (in4ub[x*4+3] < 128)
2057                                 pixelmask[x] = false;
2058         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2059         // helps sprites, text and hud artwork
2060         switch(thread->fb_blendmode)
2061         {
2062         case DPSOFTRAST_BLENDMODE_ALPHA:
2063         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2064         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2065                 maskx = startx;
2066                 for (x = startx;x < endx;x++)
2067                 {
2068                         if (in4ub[x*4+3] >= 1)
2069                         {
2070                                 startx = x;
2071                                 for (;;)
2072                                 {
2073                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2074                                         maskx = x;
2075                                         if (x >= endx) break;
2076                                         ++x;
2077                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2078                                         if (x >= endx) break;
2079                                 }
2080                                 break;
2081                         }
2082                 }
2083                 endx = maskx;
2084                 break;
2085         case DPSOFTRAST_BLENDMODE_OPAQUE:
2086         case DPSOFTRAST_BLENDMODE_ADD:
2087         case DPSOFTRAST_BLENDMODE_INVMOD:
2088         case DPSOFTRAST_BLENDMODE_MUL:
2089         case DPSOFTRAST_BLENDMODE_MUL2:
2090         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2091         case DPSOFTRAST_BLENDMODE_INVADD:
2092                 break;
2093         }
2094         // put some special values at the end of the mask to ensure the loops end
2095         pixelmask[endx] = 1;
2096         pixelmask[endx+1] = 0;
2097         // LordHavoc: use a double loop to identify subspans, this helps the
2098         // optimized copy/blend loops to perform at their best, most triangles
2099         // have only one run of pixels, and do the search using wide reads...
2100         x = startx;
2101         while (x < endx)
2102         {
2103                 // if this pixel is masked off, it's probably not alone...
2104                 if (!pixelmask[x])
2105                 {
2106                         x++;
2107 #if 1
2108                         if (x + 8 < endx)
2109                         {
2110                                 // the 4-item search must be aligned or else it stalls badly
2111                                 if ((x & 3) && !pixelmask[x]) 
2112                                 {
2113                                         if(pixelmask[x]) goto endmasked;
2114                                         x++;
2115                                         if (x & 3)
2116                                         {
2117                                                 if(pixelmask[x]) goto endmasked;
2118                                                 x++;
2119                                                 if (x & 3)
2120                                                 {
2121                                                         if(pixelmask[x]) goto endmasked;
2122                                                         x++;
2123                                                 }
2124                                         }
2125                                 }
2126                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2127                                         x += 4;
2128                         }
2129 #endif
2130                         for (;!pixelmask[x];x++)
2131                                 ;
2132                         // rather than continue the loop, just check the end variable
2133                         if (x >= endx)
2134                                 break;
2135                 }
2136         endmasked:
2137                 // find length of subspan
2138                 subx = x + 1;
2139 #if 1
2140                 if (subx + 8 < endx)
2141                 {
2142                         if (subx & 3)
2143                         {
2144                                 if(!pixelmask[subx]) goto endunmasked;
2145                                 subx++;
2146                                 if (subx & 3)
2147                                 {
2148                                         if(!pixelmask[subx]) goto endunmasked;
2149                                         subx++;
2150                                         if (subx & 3)
2151                                         {
2152                                                 if(!pixelmask[subx]) goto endunmasked;
2153                                                 subx++;
2154                                         }
2155                                 }
2156                         }
2157                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2158                                 subx += 4;
2159                 }
2160 #endif
2161                 for (;pixelmask[subx];subx++)
2162                         ;
2163                 // the checks can overshoot, so make sure to clip it...
2164                 if (subx > endx)
2165                         subx = endx;
2166         endunmasked:
2167                 // now that we know the subspan length...  process!
2168                 switch(thread->fb_blendmode)
2169                 {
2170                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2171 #if 0
2172                         if (subx - x >= 16)
2173                         {
2174                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2175                                 x = subx;
2176                         }
2177                         else
2178 #elif 1
2179                         while (x + 16 <= subx)
2180                         {
2181                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2182                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2183                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2184                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2185                                 x += 16;
2186                         }
2187 #endif
2188                         {
2189                                 while (x + 4 <= subx)
2190                                 {
2191                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2192                                         x += 4;
2193                                 }
2194                                 if (x + 2 <= subx)
2195                                 {
2196                                         pixeli[x] = ini[x];
2197                                         pixeli[x+1] = ini[x+1];
2198                                         x += 2;
2199                                 }
2200                                 if (x < subx)
2201                                 {
2202                                         pixeli[x] = ini[x];
2203                                         x++;
2204                                 }
2205                         }
2206                         break;
2207                 case DPSOFTRAST_BLENDMODE_ALPHA:
2208                 #define FINISHBLEND(blend2, blend1) \
2209                         for (;x + 1 < subx;x += 2) \
2210                         { \
2211                                 __m128i src, dst; \
2212                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2213                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2214                                 blend2; \
2215                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2216                         } \
2217                         if (x < subx) \
2218                         { \
2219                                 __m128i src, dst; \
2220                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2221                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2222                                 blend1; \
2223                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2224                                 x++; \
2225                         }
2226                         FINISHBLEND({
2227                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2228                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2229                         }, {
2230                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2231                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2232                         });
2233                         break;
2234                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2235                         FINISHBLEND({
2236                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2237                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2238                         }, {
2239                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2240                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2241                         });
2242                         break;
2243                 case DPSOFTRAST_BLENDMODE_ADD:
2244                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2245                         break;
2246                 case DPSOFTRAST_BLENDMODE_INVMOD:
2247                         FINISHBLEND({
2248                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2249                         }, {
2250                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2251                         });
2252                         break;
2253                 case DPSOFTRAST_BLENDMODE_MUL:
2254                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2255                         break;
2256                 case DPSOFTRAST_BLENDMODE_MUL2:
2257                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2258                         break;
2259                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2260                         FINISHBLEND({
2261                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2262                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2263                         }, {
2264                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2265                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2266                         });
2267                         break;
2268                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2269                         FINISHBLEND({
2270                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2271                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2272                         }, {
2273                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2274                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2275                         });
2276                         break;
2277                 case DPSOFTRAST_BLENDMODE_INVADD:
2278                         FINISHBLEND({
2279                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2280                         }, {
2281                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2282                         });
2283                         break;
2284                 }
2285         }
2286 #endif
2287 }
2288
2289 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2290         // warning: this is SLOW, only use if the optimized per-span functions won't do
2291 {
2292         const unsigned char * RESTRICT pixelbase;
2293         const unsigned char * RESTRICT pixel[4];
2294         int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2295         int wrapmask[2] = { width-1, height-1 };
2296         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2297         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2298         {
2299                 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2300                 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2301                 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2302                 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2303                 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2304                 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2305                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2306                 {
2307                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2308                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2309                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2310                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2311                 }
2312                 else
2313                 {
2314                         tci[0] &= wrapmask[0];
2315                         tci[1] &= wrapmask[1];
2316                         tci1[0] &= wrapmask[0];
2317                         tci1[1] &= wrapmask[1];
2318                 }
2319                 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2320                 pixel[1] = pixelbase + 4 * (tci[1]*width+tci1[0]);
2321                 pixel[2] = pixelbase + 4 * (tci1[1]*width+tci[0]);
2322                 pixel[3] = pixelbase + 4 * (tci1[1]*width+tci1[0]);
2323                 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2324                 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2325                 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2326                 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2327         }
2328         else
2329         {
2330                 int tci[2] = { x * width, y * height };
2331                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2332                 {
2333                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2334                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2335                 }
2336                 else
2337                 {
2338                         tci[0] &= wrapmask[0];
2339                         tci[1] &= wrapmask[1];
2340                 }
2341                 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2342                 c[0] = pixel[0][0];
2343                 c[1] = pixel[0][1];
2344                 c[2] = pixel[0][2];
2345                 c[3] = pixel[0][3];
2346         }
2347 }
2348
2349 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2350 {
2351         int x;
2352         int startx = span->startx;
2353         int endx = span->endx;
2354         int flags;
2355         float c[4];
2356         float data[4];
2357         float slope[4];
2358         float tc[2], endtc[2];
2359         float tcscale[2];
2360         unsigned int tci[2];
2361         unsigned int tci1[2];
2362         unsigned int tcimin[2];
2363         unsigned int tcimax[2];
2364         int tciwrapmask[2];
2365         int tciwidth;
2366         int filter;
2367         int mip;
2368         const unsigned char * RESTRICT pixelbase;
2369         const unsigned char * RESTRICT pixel[4];
2370         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2371         // if no texture is bound, just fill it with white
2372         if (!texture)
2373         {
2374                 for (x = startx;x < endx;x++)
2375                 {
2376                         out4f[x*4+0] = 1.0f;
2377                         out4f[x*4+1] = 1.0f;
2378                         out4f[x*4+2] = 1.0f;
2379                         out4f[x*4+3] = 1.0f;
2380                 }
2381                 return;
2382         }
2383         mip = triangle->mip[texunitindex];
2384         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2385         // if this mipmap of the texture is 1 pixel, just fill it with that color
2386         if (texture->mipmap[mip][1] == 4)
2387         {
2388                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2389                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2390                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2391                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2392                 for (x = startx;x < endx;x++)
2393                 {
2394                         out4f[x*4+0] = c[0];
2395                         out4f[x*4+1] = c[1];
2396                         out4f[x*4+2] = c[2];
2397                         out4f[x*4+3] = c[3];
2398                 }
2399                 return;
2400         }
2401         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2402         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2403         flags = texture->flags;
2404         tcscale[0] = texture->mipmap[mip][2];
2405         tcscale[1] = texture->mipmap[mip][3];
2406         tciwidth = texture->mipmap[mip][2];
2407         tcimin[0] = 0;
2408         tcimin[1] = 0;
2409         tcimax[0] = texture->mipmap[mip][2]-1;
2410         tcimax[1] = texture->mipmap[mip][3]-1;
2411         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2412         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2413         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2414         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2415         if (filter)
2416         {
2417                 endtc[0] -= 0.5f;
2418                 endtc[1] -= 0.5f;
2419         }
2420         for (x = startx;x < endx;)
2421         {
2422                 unsigned int subtc[2];
2423                 unsigned int substep[2];
2424                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2425                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2426                 if (nextsub >= endx)
2427                 {
2428                         nextsub = endsub = endx-1;      
2429                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2430                 }
2431                 tc[0] = endtc[0];
2432                 tc[1] = endtc[1];
2433                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2434                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2435                 if (filter)
2436                 {
2437                         endtc[0] -= 0.5f;
2438                         endtc[1] -= 0.5f;
2439                 }
2440                 substep[0] = (endtc[0] - tc[0]) * subscale;
2441                 substep[1] = (endtc[1] - tc[1]) * subscale;
2442                 subtc[0] = tc[0] * (1<<12);
2443                 subtc[1] = tc[1] * (1<<12);
2444                 if (filter)
2445                 {
2446                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2447                         {
2448                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2449                                 {
2450                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2451                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2452                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2453                                         tci[0] = subtc[0]>>12;
2454                                         tci[1] = subtc[1]>>12;
2455                                         tci1[0] = tci[0] + 1;
2456                                         tci1[1] = tci[1] + 1;
2457                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2458                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2459                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2460                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2461                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2462                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2463                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2464                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2465                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2466                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2467                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2468                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2469                                         out4f[x*4+0] = c[0];
2470                                         out4f[x*4+1] = c[1];
2471                                         out4f[x*4+2] = c[2];
2472                                         out4f[x*4+3] = c[3];
2473                                 }
2474                         }
2475                         else
2476                         {
2477                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2478                                 {
2479                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2480                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2481                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2482                                         tci[0] = subtc[0]>>12;
2483                                         tci[1] = subtc[1]>>12;
2484                                         tci1[0] = tci[0] + 1;
2485                                         tci1[1] = tci[1] + 1;
2486                                         tci[0] &= tciwrapmask[0];
2487                                         tci[1] &= tciwrapmask[1];
2488                                         tci1[0] &= tciwrapmask[0];
2489                                         tci1[1] &= tciwrapmask[1];
2490                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2491                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2492                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2493                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2494                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2495                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2496                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2497                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2498                                         out4f[x*4+0] = c[0];
2499                                         out4f[x*4+1] = c[1];
2500                                         out4f[x*4+2] = c[2];
2501                                         out4f[x*4+3] = c[3];
2502                                 }
2503                         }
2504                 }
2505                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2506                 {
2507                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2508                         {
2509                                 tci[0] = subtc[0]>>12;
2510                                 tci[1] = subtc[1]>>12;
2511                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2512                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2513                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2514                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2515                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2516                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2517                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2518                                 out4f[x*4+0] = c[0];
2519                                 out4f[x*4+1] = c[1];
2520                                 out4f[x*4+2] = c[2];
2521                                 out4f[x*4+3] = c[3];
2522                         }
2523                 }
2524                 else
2525                 {
2526                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2527                         {
2528                                 tci[0] = subtc[0]>>12;
2529                                 tci[1] = subtc[1]>>12;
2530                                 tci[0] &= tciwrapmask[0];
2531                                 tci[1] &= tciwrapmask[1];
2532                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2533                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2534                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2535                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2536                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2537                                 out4f[x*4+0] = c[0];
2538                                 out4f[x*4+1] = c[1];
2539                                 out4f[x*4+2] = c[2];
2540                                 out4f[x*4+3] = c[3];
2541                         }
2542                 }
2543         }
2544 }
2545
2546 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2547 {
2548 #ifdef SSE_POSSIBLE
2549         int x;
2550         int startx = span->startx;
2551         int endx = span->endx;
2552         int flags;
2553         __m128 data, slope, tcscale;
2554         __m128i tcsize, tcmask, tcoffset, tcmax;
2555         __m128 tc, endtc;
2556         __m128i subtc, substep, endsubtc;
2557         int filter;
2558         int mip;
2559         int affine; // LordHavoc: optimized affine texturing case
2560         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2561         const unsigned char * RESTRICT pixelbase;
2562         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2563         // if no texture is bound, just fill it with white
2564         if (!texture)
2565         {
2566                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2567                 return;
2568         }
2569         mip = triangle->mip[texunitindex];
2570         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2571         // if this mipmap of the texture is 1 pixel, just fill it with that color
2572         if (texture->mipmap[mip][1] == 4)
2573         {
2574                 unsigned int k = *((const unsigned int *)pixelbase);
2575                 for (x = startx;x < endx;x++)
2576                         outi[x] = k;
2577                 return;
2578         }
2579         affine = zf[startx] == zf[endx-1];
2580         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2581         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2582         flags = texture->flags;
2583         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2584         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2585         tcscale = _mm_cvtepi32_ps(tcsize);
2586         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2587         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2588         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2589         if (filter)
2590                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2591         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2592         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2593         tcmax = _mm_packs_epi32(tcmask, tcmask);
2594         for (x = startx;x < endx;)
2595         {
2596                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2597                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2598                 if (nextsub >= endx || affine)
2599                 {
2600                         nextsub = endsub = endx-1;
2601                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2602                 }       
2603                 tc = endtc;
2604                 subtc = endsubtc;
2605                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2606                 if (filter)
2607                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2608                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2609                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2610                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2611                 substep = _mm_slli_epi32(substep, 1);
2612                 if (filter)
2613                 {
2614                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2615                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2616                         {
2617                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2618                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2619                                 {
2620                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2621                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2622                                         tci = _mm_madd_epi16(tci, tcoffset);
2623                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2624                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2625                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2626                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2627                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2628                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2629                                         fracm = _mm_srli_epi16(subtc, 1);
2630                                         pix1 = _mm_add_epi16(pix1,
2631                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2632                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2633                                         pix3 = _mm_add_epi16(pix3,
2634                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2635                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2636                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2637                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2638                                         pix2 = _mm_add_epi16(pix2,
2639                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2640                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2641                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2642                                 }
2643                                 if (x <= endsub)
2644                                 {
2645                                         const unsigned char * RESTRICT ptr1;
2646                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2647                                         tci = _mm_madd_epi16(tci, tcoffset);
2648                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2649                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2650                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2651                                         fracm = _mm_srli_epi16(subtc, 1);
2652                                         pix1 = _mm_add_epi16(pix1,
2653                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2654                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2655                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2656                                         pix1 = _mm_add_epi16(pix1,
2657                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2658                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2659                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2660                                         x++;
2661                                 }
2662                         }
2663                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2664                         {
2665                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2666                                 {
2667                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2668                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2669                                         tci = _mm_madd_epi16(tci, tcoffset);
2670                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2671                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2672                                                                                         _mm_setzero_si128());
2673                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2674                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2675                                                                                         _mm_setzero_si128());
2676                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2677                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2678                                         tci = _mm_madd_epi16(tci, tcoffset);
2679                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2680                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2681                                                                                         _mm_setzero_si128());
2682                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2683                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2684                                                                                         _mm_setzero_si128());
2685                                         fracm = _mm_srli_epi16(subtc, 1);
2686                                         pix1 = _mm_add_epi16(pix1,
2687                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2688                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2689                                         pix3 = _mm_add_epi16(pix3,
2690                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2691                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2692                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2693                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2694                                         pix2 = _mm_add_epi16(pix2,
2695                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2696                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2697                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2698                                 }
2699                                 if (x <= endsub)
2700                                 {
2701                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2702                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2703                                         tci = _mm_madd_epi16(tci, tcoffset);
2704                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2705                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2706                                                                                         _mm_setzero_si128());
2707                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2708                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2709                                                                                         _mm_setzero_si128());
2710                                         fracm = _mm_srli_epi16(subtc, 1);
2711                                         pix1 = _mm_add_epi16(pix1,
2712                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2713                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2714                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2715                                         pix1 = _mm_add_epi16(pix1,
2716                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2717                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2718                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2719                                         x++;
2720                                 }
2721                         }
2722                         else
2723                         {
2724                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2725                                 {
2726                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2727                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2728                                         tci = _mm_madd_epi16(tci, tcoffset);
2729                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2730                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2731                                                                                         _mm_setzero_si128());
2732                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2733                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2734                                                                                         _mm_setzero_si128());
2735                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2736                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2737                                         tci = _mm_madd_epi16(tci, tcoffset);
2738                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2739                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2740                                                                                         _mm_setzero_si128());
2741                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2742                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2743                                                                                         _mm_setzero_si128());
2744                                         fracm = _mm_srli_epi16(subtc, 1);
2745                                         pix1 = _mm_add_epi16(pix1,
2746                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2747                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2748                                         pix3 = _mm_add_epi16(pix3,
2749                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2750                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2751                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2752                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2753                                         pix2 = _mm_add_epi16(pix2,
2754                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2755                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2756                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2757                                 }
2758                                 if (x <= endsub)
2759                                 {
2760                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2761                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2762                                         tci = _mm_madd_epi16(tci, tcoffset);
2763                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2764                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2765                                                                                         _mm_setzero_si128());
2766                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2767                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2768                                                                                         _mm_setzero_si128());
2769                                         fracm = _mm_srli_epi16(subtc, 1);
2770                                         pix1 = _mm_add_epi16(pix1,
2771                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2772                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2773                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2774                                         pix1 = _mm_add_epi16(pix1,
2775                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2776                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2777                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2778                                         x++;
2779                                 }
2780                         }
2781                 }
2782                 else
2783                 {
2784                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2785                         {
2786                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2787                                 {
2788                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2789                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2790                                         tci = _mm_madd_epi16(tci, tcoffset);
2791                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2792                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2793                                 }
2794                                 if (x <= endsub)
2795                                 {
2796                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2797                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2798                                         tci = _mm_madd_epi16(tci, tcoffset);
2799                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2800                                         x++;
2801                                 }
2802                         }
2803                         else
2804                         {
2805                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2806                                 {
2807                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2808                                         tci = _mm_and_si128(tci, tcmax); 
2809                                         tci = _mm_madd_epi16(tci, tcoffset);
2810                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2811                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2812                                 }
2813                                 if (x <= endsub)
2814                                 {
2815                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2816                                         tci = _mm_and_si128(tci, tcmax); 
2817                                         tci = _mm_madd_epi16(tci, tcoffset);
2818                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2819                                         x++;
2820                                 }
2821                         }
2822                 }
2823         }
2824 #endif
2825 }
2826
2827 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2828 {
2829         // TODO: IMPLEMENT
2830         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2831 }
2832
// Shadowmap lookup stub: the input vector is not examined and the result
// is always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float result = 1.0f;
        (void)vector;
        return result;
}
2838
2839 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2840 {
2841         int x;
2842         int startx = span->startx;
2843         int endx = span->endx;
2844         float c[4];
2845         float data[4];
2846         float slope[4];
2847         float z;
2848         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2849         for (x = startx;x < endx;x++)
2850         {
2851                 z = zf[x];
2852                 c[0] = (data[0] + slope[0]*x) * z;
2853                 c[1] = (data[1] + slope[1]*x) * z;
2854                 c[2] = (data[2] + slope[2]*x) * z;
2855                 c[3] = (data[3] + slope[3]*x) * z;
2856                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2857                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2858                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2859                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2860         }
2861 }
2862
2863 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2864 {
2865         int x;
2866         int startx = span->startx;
2867         int endx = span->endx;
2868         float c[4];
2869         float data[4];
2870         float slope[4];
2871         float z;
2872         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2873         for (x = startx;x < endx;x++)
2874         {
2875                 z = zf[x];
2876                 c[0] = (data[0] + slope[0]*x) * z;
2877                 c[1] = (data[1] + slope[1]*x) * z;
2878                 c[2] = (data[2] + slope[2]*x) * z;
2879                 c[3] = (data[3] + slope[3]*x) * z;
2880                 out4f[x*4+0] = c[0];
2881                 out4f[x*4+1] = c[1];
2882                 out4f[x*4+2] = c[2];
2883                 out4f[x*4+3] = c[3];
2884         }
2885 }
2886
2887 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2888 {
2889         int x, startx = span->startx, endx = span->endx;
2890         float c[4], localcolor[4];
2891         localcolor[0] = subcolor[0];
2892         localcolor[1] = subcolor[1];
2893         localcolor[2] = subcolor[2];
2894         localcolor[3] = subcolor[3];
2895         for (x = startx;x < endx;x++)
2896         {
2897                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2898                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2899                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2900                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2901                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2902                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2903                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2904                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2905         }
2906 }
2907
2908 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2909 {
2910         int x, startx = span->startx, endx = span->endx;
2911         for (x = startx;x < endx;x++)
2912         {
2913                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2914                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2915                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2916                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2917         }
2918 }
2919
2920 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2921 {
2922         int x, startx = span->startx, endx = span->endx;
2923         for (x = startx;x < endx;x++)
2924         {
2925                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2926                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2927                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2928                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2929         }
2930 }
2931
2932 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2933 {
2934         int x, startx = span->startx, endx = span->endx;
2935         float a, b;
2936         for (x = startx;x < endx;x++)
2937         {
2938                 a = 1.0f - inb4f[x*4+3];
2939                 b = inb4f[x*4+3];
2940                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2941                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2942                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2943                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2944         }
2945 }
2946
2947 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2948 {
2949         int x, startx = span->startx, endx = span->endx;
2950         float localcolor[4], ilerp, lerp;
2951         localcolor[0] = color[0];
2952         localcolor[1] = color[1];
2953         localcolor[2] = color[2];
2954         localcolor[3] = color[3];
2955         ilerp = 1.0f - localcolor[3];
2956         lerp = localcolor[3];
2957         for (x = startx;x < endx;x++)
2958         {
2959                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2960                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2961                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2962                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2963         }
2964 }
2965
2966
2967
2968 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2969 {
2970 #ifdef SSE_POSSIBLE
2971         int x;
2972         int startx = span->startx;
2973         int endx = span->endx;
2974         __m128 data, slope;
2975         __m128 mod, endmod;
2976         __m128i submod, substep, endsubmod;
2977         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2978         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2979         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2980         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2981         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2982         for (x = startx; x < endx;)
2983         {
2984                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2985                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2986                 if (nextsub >= endx)
2987                 {
2988                         nextsub = endsub = endx-1;
2989                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2990                 }
2991                 mod = endmod;
2992                 submod = endsubmod;
2993                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2994                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2995                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2996                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2997                 substep = _mm_packs_epi32(substep, substep);
2998                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2999                 {
3000                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3001                         pix = _mm_mulhi_epu16(pix, submod);
3002                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3003                 }
3004                 if (x <= endsub)
3005                 {
3006                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3007                         pix = _mm_mulhi_epu16(pix, submod);
3008                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3009                         x++;
3010                 }
3011         }
3012 #endif
3013 }
3014
3015 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3016 {
3017 #ifdef SSE_POSSIBLE
3018         int x;
3019         int startx = span->startx;
3020         int endx = span->endx;
3021         __m128 data, slope;
3022         __m128 mod, endmod;
3023         __m128i submod, substep, endsubmod;
3024         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3025         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3026         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3027         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3028         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3029         for (x = startx; x < endx;)
3030         {
3031                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3032                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3033                 if (nextsub >= endx)
3034                 {
3035                         nextsub = endsub = endx-1;
3036                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3037                 }
3038                 mod = endmod;
3039                 submod = endsubmod;
3040                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3041                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3042                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3043                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3044                 substep = _mm_packs_epi32(substep, substep);
3045                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3046                 {
3047                         __m128i pix = _mm_srai_epi16(submod, 4);
3048                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3049                 }
3050                 if (x <= endsub)
3051                 {
3052                         __m128i pix = _mm_srai_epi16(submod, 4);
3053                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3054                         x++;
3055                 }
3056         }
3057 #endif
3058 }
3059
3060 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3061 {
3062 #ifdef SSE_POSSIBLE
3063         int x, startx = span->startx, endx = span->endx;
3064         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3065         localcolor = _mm_packs_epi32(localcolor, localcolor);
3066         for (x = startx;x+2 <= endx;x+=2)
3067         {
3068                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3069                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3070                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3071                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3072         }
3073         if (x < endx)
3074         {
3075                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3076                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3077                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3078                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3079         }
3080 #endif
3081 }
3082
3083 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3084 {
3085 #ifdef SSE_POSSIBLE
3086         int x, startx = span->startx, endx = span->endx;
3087         for (x = startx;x+2 <= endx;x+=2)
3088         {
3089                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3090                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3091                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3092                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3093         }
3094         if (x < endx)
3095         {
3096                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3097                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3098                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3099                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3100         }
3101 #endif
3102 }
3103
3104 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3105 {
3106 #ifdef SSE_POSSIBLE
3107         int x, startx = span->startx, endx = span->endx;
3108         for (x = startx;x+2 <= endx;x+=2)
3109         {
3110                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3111                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3112                 pix1 = _mm_add_epi16(pix1, pix2);
3113                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3114         }
3115         if (x < endx)
3116         {
3117                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3118                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3119                 pix1 = _mm_add_epi16(pix1, pix2);
3120                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3121         }
3122 #endif
3123 }
3124
3125 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3126 {
3127 #ifdef SSE_POSSIBLE
3128         int x, startx = span->startx, endx = span->endx;
3129         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3130         tint = _mm_packs_epi32(tint, tint);
3131         for (x = startx;x+2 <= endx;x+=2)
3132         {
3133                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3134                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3135                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3136                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3137         }
3138         if (x < endx)
3139         {
3140                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3141                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3142                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3143                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3144         }
3145 #endif
3146 }
3147
3148 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3149 {
3150 #ifdef SSE_POSSIBLE
3151         int x, startx = span->startx, endx = span->endx;
3152         for (x = startx;x+2 <= endx;x+=2)
3153         {
3154                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3155                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3156                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3157                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3158                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3159         }
3160         if (x < endx)
3161         {
3162                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3163                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3164                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3165                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3166                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3167         }
3168 #endif
3169 }
3170
3171 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3172 {
3173 #ifdef SSE_POSSIBLE
3174         int x, startx = span->startx, endx = span->endx;
3175         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3176         localcolor = _mm_packs_epi32(localcolor, localcolor);
3177         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3178         for (x = startx;x+2 <= endx;x+=2)
3179         {
3180                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3181                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3182                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3183         }
3184         if (x < endx)
3185         {
3186                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3187                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3188                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3189         }
3190 #endif
3191 }
3192
3193
3194
3195 void DPSOFTRAST_VertexShader_Generic(void)
3196 {
3197         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3198         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3199         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3200         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3201                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3202 }
3203
3204 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3205 {
3206         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3207         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3208         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3209         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3210         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3211         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3212         {
3213                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3214                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3215                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3216                 {
3217                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3218                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3219                         {
3220                                 // multiply
3221                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3222                         }
3223                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3224                         {
3225                                 // add
3226                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3227                         }
3228                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3229                         {
3230                                 // alphablend
3231                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3232                         }
3233                 }
3234         }
3235         else
3236                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3237         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3238 }
3239
3240
3241
3242 void DPSOFTRAST_VertexShader_PostProcess(void)
3243 {
3244         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3245         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3246         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3247 }
3248
3249 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3250 {
3251         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3252         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3253         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3254         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3255         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3256         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3257         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3258         {
3259                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3260                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3261         }
3262         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3263         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3264         {
3265                 // TODO: implement saturation
3266         }
3267         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3268         {
3269                 // TODO: implement gammaramps
3270         }
3271         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3272 }
3273
3274
3275
3276 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3277 {
3278         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3279 }
3280
3281 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3282 {
3283         // this is never called (because colormask is off when this shader is used)
3284         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3285         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3286         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3287         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3288         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3289 }
3290
3291
3292
3293 void DPSOFTRAST_VertexShader_FlatColor(void)
3294 {
3295         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3296         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3297 }
3298
3299 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3300 {
3301 #ifdef SSE_POSSIBLE
3302         unsigned char * RESTRICT pixelmask = span->pixelmask;
3303         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3304         int x, startx = span->startx, endx = span->endx;
3305         __m128i Color_Ambientm;
3306         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3307         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3308         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3309         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3310         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3311         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3312                 pixel = buffer_FragColorbgra8;
3313         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3314         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3315         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3316         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3317         for (x = startx;x < endx;x++)
3318         {
3319                 __m128i color, pix;
3320                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3321                 {
3322                         __m128i pix2;
3323                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3324                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3325                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3326                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3327                         x += 3;
3328                         continue;
3329                 }
3330                 if (!pixelmask[x])
3331                         continue;
3332                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3333                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3334                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3335         }
3336         if (pixel == buffer_FragColorbgra8)
3337                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3338 #endif
3339 }
3340
3341
3342
3343 void DPSOFTRAST_VertexShader_VertexColor(void)
3344 {
3345         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3346         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3347         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3348 }
3349
3350 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3351 {
3352 #ifdef SSE_POSSIBLE
3353         unsigned char * RESTRICT pixelmask = span->pixelmask;
3354         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3355         int x, startx = span->startx, endx = span->endx;
3356         __m128i Color_Ambientm, Color_Diffusem;
3357         __m128 data, slope;
3358         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3359         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3360         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3361         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3362         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3363         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3364         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3365                 pixel = buffer_FragColorbgra8;
3366         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3367         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3368         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3369         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3370         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3371         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3372         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3373         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3374         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3375         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3376         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3377         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3378         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3379         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3380         {
3381                 __m128i color, mod, pix;
3382                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3383                 {
3384                         __m128i pix2, mod2;
3385                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3386                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3387                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3388                         data = _mm_add_ps(data, slope);
3389                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3390                         data = _mm_add_ps(data, slope);
3391                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3392                         data = _mm_add_ps(data, slope);
3393                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3394                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3395                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3396                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3397                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3398                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3399                         x += 3;
3400                         continue;
3401                 }
3402                 if (!pixelmask[x])
3403                         continue;
3404                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3405                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3406                 mod = _mm_packs_epi32(mod, mod);
3407                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3408                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3409         }
3410         if (pixel == buffer_FragColorbgra8)
3411                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3412 #endif
3413 }
3414
3415
3416
3417 void DPSOFTRAST_VertexShader_Lightmap(void)
3418 {
3419         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3420         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3421         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3422 }
3423
3424 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3425 {
3426 #ifdef SSE_POSSIBLE
3427         unsigned char * RESTRICT pixelmask = span->pixelmask;
3428         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3429         int x, startx = span->startx, endx = span->endx;
3430         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3431         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3432         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3433         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3434         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3435         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3436         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3437         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3438         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3439         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3440                 pixel = buffer_FragColorbgra8;
3441         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3442         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3443         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3444         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3445         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3446         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3447         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3448         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3449         {
3450                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3451                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3452                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3453                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3454                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3455                 for (x = startx;x < endx;x++)
3456                 {
3457                         __m128i color, lightmap, glow, pix;
3458                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3459                         {
3460                                 __m128i pix2;
3461                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3462                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3463                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3464                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3465                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3466                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3467                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3468                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3469                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3470                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3471                                 x += 3;
3472                                 continue;
3473                         }
3474                         if (!pixelmask[x])
3475                                 continue;
3476                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3477                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3478                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3479                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3480                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3481                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3482                 }
3483         }
3484         else
3485         {
3486                 for (x = startx;x < endx;x++)
3487                 {
3488                         __m128i color, lightmap, pix;
3489                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3490                         {
3491                                 __m128i pix2;
3492                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3493                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3494                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3495                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3496                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3497                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3498                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3499                                 x += 3;
3500                                 continue;
3501                         }
3502                         if (!pixelmask[x]) 
3503                                 continue;
3504                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3505                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3506                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3507                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3508                 }
3509         }
3510         if (pixel == buffer_FragColorbgra8)
3511                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3512 #endif
3513 }
3514
3515
3516 void DPSOFTRAST_VertexShader_LightDirection(void);
3517 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3518
// Vertex shader for the FakeLight mode: does no work of its own and simply runs the
// LightDirection vertex shader.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3523
3524 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3525 {
3526         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3527 }
3528
3529
3530
3531 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3532 {
3533         DPSOFTRAST_VertexShader_LightDirection();
3534         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3535 }
3536
3537 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3538 {
3539         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3540 }
3541
3542
3543
3544 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3545 {
3546         DPSOFTRAST_VertexShader_LightDirection();
3547         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3548 }
3549
3550 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3551 {
3552         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3553 }
3554
3555
3556
3557 void DPSOFTRAST_VertexShader_LightDirection(void)
3558 {
3559         int i;
3560         int numvertices = dpsoftrast.numvertices;
3561         float LightDir[4];
3562         float LightVector[4];
3563         float EyePosition[4];
3564         float EyeVectorModelSpace[4];
3565         float EyeVector[4];
3566         float position[4];
3567         float svector[4];
3568         float tvector[4];
3569         float normal[4];
3570         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3571         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3572         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3573         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3574         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3575         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3576         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3577         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3578         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3579         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3580         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3581         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3582         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3583         for (i = 0;i < numvertices;i++)
3584         {
3585                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3586                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3587                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3588                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3589                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3590                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3591                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3592                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3593                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3594                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3595                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3596                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3597                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3598                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3599                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3600                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3601                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3602                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3603                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3604                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3605                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3606                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3607                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3608                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3609                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3610                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3611                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3612                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3613                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3614         }
3615         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3616 }
3617
3618 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3619 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3620 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3621 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3622 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3623 #define DPSOFTRAST_Vector3Normalize(v)\
3624 do\
3625 {\
3626         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3627         if (len)\
3628         {\
3629                 len = 1.0f / len;\
3630                 v[0] *= len;\
3631                 v[1] *= len;\
3632                 v[2] *= len;\
3633         }\
3634 }\
3635 while(0)
3636
3637 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3638 {
3639         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3640         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3641         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3642         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3643         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3644         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3645         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3646         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3647         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3648         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3649         int x, startx = span->startx, endx = span->endx;
3650         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3651         float LightVectordata[4];
3652         float LightVectorslope[4];
3653         float EyeVectordata[4];
3654         float EyeVectorslope[4];
3655         float VectorSdata[4];
3656         float VectorSslope[4];
3657         float VectorTdata[4];
3658         float VectorTslope[4];
3659         float VectorRdata[4];
3660         float VectorRslope[4];
3661         float z;
3662         float diffusetex[4];
3663         float glosstex[4];
3664         float surfacenormal[4];
3665         float lightnormal[4];
3666         float lightnormal_modelspace[4];
3667         float eyenormal[4];
3668         float specularnormal[4];
3669         float diffuse;
3670         float specular;
3671         float SpecularPower;
3672         int d[4];
3673         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3674         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3675         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3676         Color_Glow[3] = 0.0f;
3677         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3678         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3679         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3680         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3681         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3682         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3683         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3684         Color_Pants[3] = 0.0f;
3685         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3686         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3687         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3688         Color_Shirt[3] = 0.0f;
3689         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3690         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3691         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3692         {
3693                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3694                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3695         }
3696         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3697         {
3698                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3699         }
3700         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3701         {
3702                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3703                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3704                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3705                 Color_Diffuse[3] = 0.0f;
3706                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3707                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3708                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3709                 LightColor[3] = 0.0f;
3710                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3711                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3712                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3713                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3714                 Color_Specular[3] = 0.0f;
3715                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3716                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3717                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3718
3719                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3720                 {
3721                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3722                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3723                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3724                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3725                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3726                 }
3727                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3728                 {
3729                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3730                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3731                 }
3732                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3733                 {
3734                         // nothing of this needed
3735                 }
3736                 else
3737                 {
3738                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3739                 }
3740
3741                 for (x = startx;x < endx;x++)
3742                 {
3743                         z = buffer_z[x];
3744                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3745                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3746                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3747                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3748                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3749                         {
3750                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3751                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3752                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3753                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3754                         }
3755                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3756                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3757                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3758                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3759                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3760                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3761                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3762                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3763
3764                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3765                         {
3766                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3767                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3768                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3769                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3770
3771                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3772                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3773                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3774                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3775
3776                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3777                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3778                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3779                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3780
3781                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3782                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3783                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3784                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3785
3786                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3787                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3788
3789                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3790                                 {
3791                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3792                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3793                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3794                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3795                                 }
3796                         }
3797                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3798                         {
3799                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3800                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3801                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3802                                 {
3803                                         float f = 1.0f / 256.0f;
3804                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3805                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3806                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3807                                 }
3808                         }
3809                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3810                         {
3811                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3812                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3813                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3814                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3815
3816                                 LightColor[0] = 1.0;
3817                                 LightColor[1] = 1.0;
3818                                 LightColor[2] = 1.0;
3819                         }
3820                         else
3821                         {
3822                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3823                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3824                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3825                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3826                         }
3827
3828                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3829
3830                         if(thread->shader_exactspecularmath)
3831                         {
3832                                 // reflect lightnormal at surfacenormal, take the negative of that
3833                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3834                                 float f;
3835                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3836                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3837                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3838                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3839
3840                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3841                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3842                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3843                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3844                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3845
3846                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3847                         }
3848                         else
3849                         {
3850                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3851                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3852                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3853                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3854
3855                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3856                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3857                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3858                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3859
3860                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < DPSOFTRAST_FLT_MIN) specular = DPSOFTRAST_FLT_MIN;
3861                         }
3862
3863                         specular = pow(specular, SpecularPower * glosstex[3]);
3864                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3865                         {
3866                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3867                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3868                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3869                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3870                         }
3871                         else
3872                         {
3873                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3874                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3875                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3876                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3877                         }
3878
3879                         buffer_FragColorbgra8[x*4+0] = d[0];
3880                         buffer_FragColorbgra8[x*4+1] = d[1];
3881                         buffer_FragColorbgra8[x*4+2] = d[2];
3882                         buffer_FragColorbgra8[x*4+3] = d[3];
3883                 }
3884         }
3885         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3886         {
3887                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3888                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3889                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3890                 Color_Diffuse[3] = 0.0f;
3891                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3892                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3893                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3894                 LightColor[3] = 0.0f;
3895                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3896
3897                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3898                 {
3899                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3900                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3901                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3902                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3903                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3904                 }
3905                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3906                 {
3907                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3908                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3909                 }
3910                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3911                 {
3912                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3913                 }
3914                 else
3915                 {
3916                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3917                 }
3918
3919                 for (x = startx;x < endx;x++)
3920                 {
3921                         z = buffer_z[x];
3922                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3923                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3924                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3925                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3926                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3927                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3928                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3929                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3930
3931                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3932                         {
3933                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3934                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3935                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3936                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3937
3938                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3939                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3940                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3941                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3942
3943                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3944                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3945                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3946                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3947
3948                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3949                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3950                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3951                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3952
3953                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3954                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3955
3956                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3957                                 {
3958                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3959                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3960                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3961                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3962                                 }
3963                         }
3964                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3965                         {
3966                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3967                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3968                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3969                                 {
3970                                         float f = 1.0f / 256.0f;
3971                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3972                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3973                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3974                                 }
3975                         }
3976                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3977                         {
3978                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3979                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3980                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3981                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3982
3983                                 LightColor[0] = 1.0;
3984                                 LightColor[1] = 1.0;
3985                                 LightColor[2] = 1.0;
3986                         }
3987                         else
3988                         {
3989                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3990                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3991                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3992                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3993                         }
3994
3995                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3996                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3997                         {
3998                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3999                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4000                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4001                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4002                         }
4003                         else
4004                         {
4005                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4006                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4007                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4008                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4009                         }
4010                         buffer_FragColorbgra8[x*4+0] = d[0];
4011                         buffer_FragColorbgra8[x*4+1] = d[1];
4012                         buffer_FragColorbgra8[x*4+2] = d[2];
4013                         buffer_FragColorbgra8[x*4+3] = d[3];
4014                 }
4015         }
4016         else
4017         {
4018                 for (x = startx;x < endx;x++)
4019                 {
4020                         z = buffer_z[x];
4021                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4022                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4023                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4024                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4025
4026                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4027                         {
4028                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4029                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4030                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4031                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4032                         }
4033                         else
4034                         {
4035                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4036                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4037                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4038                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4039                         }
4040                         buffer_FragColorbgra8[x*4+0] = d[0];
4041                         buffer_FragColorbgra8[x*4+1] = d[1];
4042                         buffer_FragColorbgra8[x*4+2] = d[2];
4043                         buffer_FragColorbgra8[x*4+3] = d[3];
4044                 }
4045         }
4046         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4047 }
4048
4049
4050
4051 void DPSOFTRAST_VertexShader_LightSource(void)
4052 {
4053         int i;
4054         int numvertices = dpsoftrast.numvertices;
4055         float LightPosition[4];
4056         float LightVector[4];
4057         float LightVectorModelSpace[4];
4058         float EyePosition[4];
4059         float EyeVectorModelSpace[4];
4060         float EyeVector[4];
4061         float position[4];
4062         float svector[4];
4063         float tvector[4];
4064         float normal[4];
4065         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4066         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4067         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4068         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4069         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4070         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4071         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4072         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4073         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4074         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4075         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4076         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4077         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4078         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4079         for (i = 0;i < numvertices;i++)
4080         {
4081                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4082                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4083                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4084                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4085                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4086                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4087                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4088                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4089                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4090                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4091                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4092                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4093                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4094                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4095                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4096                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4097                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4098                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4099                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4100                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4101                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4102                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4103                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4104                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4105                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4106                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4107                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4108                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4109                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4110                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4111                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4112                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4113         }
4114         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4115         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4116 }
4117
4118 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4119 {
4120 #ifdef SSE_POSSIBLE
4121         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4122         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4123         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4124         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4125         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4126         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4127         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4128         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4129         int x, startx = span->startx, endx = span->endx;
4130         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4131         float CubeVectordata[4];
4132         float CubeVectorslope[4];
4133         float LightVectordata[4];
4134         float LightVectorslope[4];
4135         float EyeVectordata[4];
4136         float EyeVectorslope[4];
4137         float z;
4138         float diffusetex[4];
4139         float glosstex[4];
4140         float surfacenormal[4];
4141         float lightnormal[4];
4142         float eyenormal[4];
4143         float specularnormal[4];
4144         float diffuse;
4145         float specular;
4146         float SpecularPower;
4147         float CubeVector[4];
4148         float attenuation;
4149         int d[4];
4150         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4151         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4152         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4153         Color_Glow[3] = 0.0f;
4154         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4155         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4156         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4157         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4158         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4159         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4160         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4161         Color_Diffuse[3] = 0.0f;
4162         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4163         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4164         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4165         Color_Specular[3] = 0.0f;
4166         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4167         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4168         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4169         Color_Pants[3] = 0.0f;
4170         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4171         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4172         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4173         Color_Shirt[3] = 0.0f;
4174         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4175         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4176         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4177         LightColor[3] = 0.0f;
4178         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4179         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4180         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4181         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4182         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4183         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4184         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4185         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4186         {
4187                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4188                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4189         }
4190         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4191                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4192         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4193         {
4194                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4195                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4196                 for (x = startx;x < endx;x++)
4197                 {
4198                         z = buffer_z[x];
4199                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4200                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4201                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4202                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4203                         if (attenuation < 0.01f)
4204                                 continue;
4205                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4206                         {
4207                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4208                                 if (attenuation < 0.01f)
4209                                         continue;
4210                         }
4211
4212                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4213                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4214                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4215                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4216                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4217                         {
4218                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4219                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4220                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4221                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4222                         }
4223                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4224                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4225                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4226                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4227                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4228                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4229                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4230                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4231
4232                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4233                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4234                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4235                         DPSOFTRAST_Vector3Normalize(lightnormal);
4236
4237                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4238
4239                         if(thread->shader_exactspecularmath)
4240                         {
4241                                 // reflect lightnormal at surfacenormal, take the negative of that
4242                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4243                                 float f;
4244                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4245                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4246                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4247                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4248
4249                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4250                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4251                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4252                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4253                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4254
4255                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4256                         }
4257                         else
4258                         {
4259                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4260                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4261                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4262                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4263
4264                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4265                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4266                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4267                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4268
4269                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < DPSOFTRAST_FLT_MIN) specular = DPSOFTRAST_FLT_MIN;
4270                         }
4271                         specular = pow(specular, SpecularPower * glosstex[3]);
4272
4273                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4274                         {
4275                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4276                                 attenuation *= (1.0f / 255.0f);
4277                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4278                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4279                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4280                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4281                         }
4282                         else
4283                         {
4284                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4285                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4286                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4287                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4288                         }
4289                         buffer_FragColorbgra8[x*4+0] = d[0];
4290                         buffer_FragColorbgra8[x*4+1] = d[1];
4291                         buffer_FragColorbgra8[x*4+2] = d[2];
4292                         buffer_FragColorbgra8[x*4+3] = d[3];
4293                 }
4294         }
4295         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4296         {
4297                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4298                 for (x = startx;x < endx;x++)
4299                 {
4300                         z = buffer_z[x];
4301                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4302                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4303                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4304                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4305                         if (attenuation < 0.01f)
4306                                 continue;
4307                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4308                         {
4309                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4310                                 if (attenuation < 0.01f)
4311                                         continue;
4312                         }
4313
4314                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4315                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4316                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4317                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4318                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4319                         {
4320                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4321                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4322                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4323                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4324                         }
4325                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4326                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4327                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4328                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4329
4330                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4331                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4332                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4333                         DPSOFTRAST_Vector3Normalize(lightnormal);
4334
4335                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4336                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4337                         {
4338                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4339                                 attenuation *= (1.0f / 255.0f);
4340                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4341                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4342                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4343                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4344                         }
4345                         else
4346                         {
4347                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4348                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4349                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4350                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4351                         }
4352                         buffer_FragColorbgra8[x*4+0] = d[0];
4353                         buffer_FragColorbgra8[x*4+1] = d[1];
4354                         buffer_FragColorbgra8[x*4+2] = d[2];
4355                         buffer_FragColorbgra8[x*4+3] = d[3];
4356                 }
4357         }
4358         else
4359         {
4360                 for (x = startx;x < endx;x++)
4361                 {
4362                         z = buffer_z[x];
4363                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4364                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4365                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4366                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4367                         if (attenuation < 0.01f)
4368                                 continue;
4369                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4370                         {
4371                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4372                                 if (attenuation < 0.01f)
4373                                         continue;
4374                         }
4375
4376                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4377                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4378                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4379                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4380                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4381                         {
4382                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4383                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4384                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4385                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4386                         }
4387                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4388                         {
4389                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4390                                 attenuation *= (1.0f / 255.0f);
4391                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4392                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4393                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4394                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4395                         }
4396                         else
4397                         {
4398                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4399                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4400                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4401                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4402                         }
4403                         buffer_FragColorbgra8[x*4+0] = d[0];
4404                         buffer_FragColorbgra8[x*4+1] = d[1];
4405                         buffer_FragColorbgra8[x*4+2] = d[2];
4406                         buffer_FragColorbgra8[x*4+3] = d[3];
4407                 }
4408         }
4409         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4410 #endif
4411 }
4412
4413
4414
4415 void DPSOFTRAST_VertexShader_Refraction(void)
4416 {
4417         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4418         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4419         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4420 }
4421
4422 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4423 {
4424         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4425         float z;
4426         int x, startx = span->startx, endx = span->endx;
4427
4428         // texture reads
4429         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4430         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4431
4432         // varyings
4433         float ModelViewProjectionPositiondata[4];
4434         float ModelViewProjectionPositionslope[4];
4435
4436         // uniforms
4437         float ScreenScaleRefractReflect[2];
4438         float ScreenCenterRefractReflect[2];
4439         float DistortScaleRefractReflect[2];
4440         float RefractColor[4];
4441
4442         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4443         if(!texture) return;
4444
4445         // read textures
4446         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4447         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4448
4449         // read varyings
4450         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4451
4452         // read uniforms
4453         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4454         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4455         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4456         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4457         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4458         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4459         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4460         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4461         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4462         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4463
4464         // do stuff
4465         for (x = startx;x < endx;x++)
4466         {
4467                 float SafeScreenTexCoord[2];
4468                 float ScreenTexCoord[2];
4469                 float v[3];
4470                 float iw;
4471                 unsigned char c[4];
4472
4473                 z = buffer_z[x];
4474
4475                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4476                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4477
4478                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4479                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4480                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4481
4482                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4483                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4484                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4485                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4486                 DPSOFTRAST_Vector3Normalize(v);
4487                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4488                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4489
4490                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4491                 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4492
4493                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4494                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4495                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4496                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4497         }
4498
4499         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4500 }
4501
4502
4503
4504 void DPSOFTRAST_VertexShader_Water(void)
4505 {
4506         int i;
4507         int numvertices = dpsoftrast.numvertices;
4508         float EyePosition[4];
4509         float EyeVectorModelSpace[4];
4510         float EyeVector[4];
4511         float position[4];
4512         float svector[4];
4513         float tvector[4];
4514         float normal[4];
4515         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4516         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4517         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4518         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4519         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4520         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4521         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4522         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4523         for (i = 0;i < numvertices;i++)
4524         {
4525                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4526                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4527                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4528                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4529                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4530                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4531                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4532                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4533                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4534                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4535                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4536                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4537                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4538                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4539                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4540                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4541                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4542                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4543                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4544                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4545                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4546                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4547         }
4548         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4549         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4550         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4551 }
4552
4553
4554 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4555 {
4556         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4557         float z;
4558         int x, startx = span->startx, endx = span->endx;
4559
4560         // texture reads
4561         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4562         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4563
4564         // varyings
4565         float ModelViewProjectionPositiondata[4];
4566         float ModelViewProjectionPositionslope[4];
4567         float EyeVectordata[4];
4568         float EyeVectorslope[4];
4569
4570         // uniforms
4571         float ScreenScaleRefractReflect[4];
4572         float ScreenCenterRefractReflect[4];
4573         float DistortScaleRefractReflect[4];
4574         float RefractColor[4];
4575         float ReflectColor[4];
4576         float ReflectFactor;
4577         float ReflectOffset;
4578
4579         DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4580         DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4581         if(!texture_refraction || !texture_reflection) return;
4582
4583         // read textures
4584         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4585         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4586
4587         // read varyings
4588         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4589         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4590
4591         // read uniforms
4592         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4593         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4594         ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4595         ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4596         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4597         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4598         ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4599         ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4600         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4601         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4602         DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4603         DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4604         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4605         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4606         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4607         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4608         ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4609         ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4610         ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4611         ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4612         ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4613         ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4614
4615         // do stuff
4616         for (x = startx;x < endx;x++)
4617         {
4618                 float SafeScreenTexCoord[4];
4619                 float ScreenTexCoord[4];
4620                 float v[3];
4621                 float iw;
4622                 unsigned char c1[4];
4623                 unsigned char c2[4];
4624                 float Fresnel;
4625
4626                 z = buffer_z[x];
4627
4628                 // "    vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4629                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4630
4631                 // "    vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4632                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4633                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4634                 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4635                 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4636
4637                 // "    vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4638                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4639                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4640                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4641                 DPSOFTRAST_Vector3Normalize(v);
4642                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4643                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4644                 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4645                 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4646
4647                 // "    float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4648                 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4649                 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4650                 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4651                 DPSOFTRAST_Vector3Normalize(v);
4652                 Fresnel = 1.0f - v[2];
4653                 Fresnel = min(1.0f, Fresnel);
4654                 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4655
4656                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4657                 // "    dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4658                 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4659                 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4660
4661                 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4662                 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4663                 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4664                 buffer_FragColorbgra8[x*4+3] = min((    RefractColor[3] *  (1.0f - Fresnel) +          ReflectColor[3]  * Fresnel) * 256, 255);
4665         }
4666
4667         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4668 }
4669
4670
4671
4672 void DPSOFTRAST_VertexShader_ShowDepth(void)
4673 {
4674         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4675 }
4676
4677 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4678 {
4679         // TODO: IMPLEMENT
4680         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4681         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4682         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4683         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4684         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4685 }
4686
4687
4688
4689 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4690 {
4691         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4692 }
4693
4694 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4695 {
4696         // TODO: IMPLEMENT
4697         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4698         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4699         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4700         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4701         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4702 }
4703
4704
4705
4706 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4707 {
4708         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4709 }
4710
4711 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4712 {
4713         // TODO: IMPLEMENT
4714         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4715         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4716         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4717         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4718         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4719 }
4720
4721
4722
4723 typedef struct DPSOFTRAST_ShaderModeInfo_s
4724 {
4725         int lodarrayindex;
4726         void (*Vertex)(void);
4727         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4728         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4729         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4730 }
4731 DPSOFTRAST_ShaderModeInfo;
4732
4733 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4734 {
4735         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4736         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4737         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4738         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4739         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4740         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4741         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4742         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4743         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4744         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4745         {2, DPSOFTRAST_VertexShader_VertexColor,                        DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4746         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4747         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4748         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4749         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4750         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4751         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4752         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4753 };
4754
4755 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4756 {
4757         int x;
4758         int startx;
4759         int endx;
4760         unsigned int *depthpixel;
4761         int depth;
4762         int depthslope;
4763         unsigned int d;
4764         unsigned char *pixelmask;
4765         DPSOFTRAST_State_Triangle *triangle;
4766         triangle = &thread->triangles[span->triangle];
4767         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4768         startx = span->startx;
4769         endx = span->endx;
4770         depth = span->depthbase;
4771         depthslope = span->depthslope;
4772         pixelmask = thread->pixelmaskarray;
4773         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4774         {
4775                 switch(thread->fb_depthfunc)
4776                 {
4777                 default:
4778                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4779                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4780                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4781                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4782                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4783                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4784                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4785                 }
4786                 while (startx < endx && !pixelmask[startx])
4787                         startx++;
4788                 while (endx > startx && !pixelmask[endx-1])
4789                         endx--;
4790         }
4791         else
4792         {
4793                 // no depth testing means we're just dealing with color...
4794                 memset(pixelmask + startx, 1, endx - startx);
4795         }
4796         span->pixelmask = pixelmask;
4797         span->startx = startx;
4798         span->endx = endx;
4799 }
4800
4801 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4802 {
4803         int x, d, depth, depthslope, startx, endx;
4804         const unsigned char *pixelmask;
4805         unsigned int *depthpixel;
4806         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4807         {
4808                 depth = span->depthbase;
4809                 depthslope = span->depthslope;
4810                 pixelmask = span->pixelmask;
4811                 startx = span->startx;
4812                 endx = span->endx;
4813                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4814                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4815                         if (pixelmask[x])
4816                                 depthpixel[x] = d;
4817         }
4818 }
4819
4820 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4821 {
4822         int i;
4823         DPSOFTRAST_State_Triangle *triangle;
4824         DPSOFTRAST_State_Span *span;
4825         for (i = 0; i < thread->numspans; i++)
4826         {
4827                 span = &thread->spans[i];
4828                 triangle = &thread->triangles[span->triangle];
4829                 DPSOFTRAST_Draw_DepthTest(thread, span);
4830                 if (span->startx >= span->endx)
4831                         continue;
4832                 // run pixel shader if appropriate
4833                 // do this before running depthmask code, to allow the pixelshader
4834                 // to clear pixelmask values for alpha testing
4835                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4836                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4837                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4838         }
4839         thread->numspans = 0;
4840 }
4841
4842 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4843
4844 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4845 {
4846 #ifdef SSE_POSSIBLE
4847         int cullface = thread->cullface;
4848         int minx, maxx, miny, maxy;
4849         int miny1, maxy1, miny2, maxy2;
4850         __m128i fbmin, fbmax;
4851         __m128 viewportcenter, viewportscale;
4852         int firstvertex = command->firstvertex;
4853         int numvertices = command->numvertices;
4854         int numtriangles = command->numtriangles;
4855         const int *element3i = command->element3i;
4856         const unsigned short *element3s = command->element3s;
4857         int clipped = command->clipped;
4858         int i;
4859         int j;
4860         int k;
4861         int y;
4862         int e[3];
4863         __m128i screeny;
4864         int starty, endy, bandy;
4865         int numpoints;
4866         int clipcase;
4867         float clipdist[4];
4868         float clip0origin, clip0slope;
4869         int clip0dir;
4870         __m128 triangleedge1, triangleedge2, trianglenormal;
4871         __m128 clipfrac[3];
4872         __m128 screen[4];
4873         DPSOFTRAST_State_Triangle *triangle;
4874         DPSOFTRAST_Texture *texture;
4875         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4876         miny = thread->fb_scissor[1];
4877         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4878         miny1 = bound(miny, thread->miny1, maxy);
4879         maxy1 = bound(miny, thread->maxy1, maxy);
4880         miny2 = bound(miny, thread->miny2, maxy);
4881         maxy2 = bound(miny, thread->maxy2, maxy);
4882         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4883         {
4884                 if (!ATOMIC_DECREMENT(command->refcount))
4885                 {
4886                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4887                                 MM_FREE(command->arrays);
4888                 }
4889                 return;
4890         }
4891         minx = thread->fb_scissor[0];
4892         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4893         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4894         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4895         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4896         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4897         screen[3] = _mm_setzero_ps();
4898         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4899         for (i = 0;i < numtriangles;i++)
4900         {
4901                 const float *screencoord4f = command->arrays;
4902                 const float *arrays = screencoord4f + numvertices*4;
4903
4904                 // generate the 3 edges of this triangle
4905                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4906                 if (element3s)
4907                 {
4908                         e[0] = element3s[i*3+0] - firstvertex;
4909                         e[1] = element3s[i*3+1] - firstvertex;
4910                         e[2] = element3s[i*3+2] - firstvertex;
4911                 }
4912                 else if (element3i)
4913                 {
4914                         e[0] = element3i[i*3+0] - firstvertex;
4915                         e[1] = element3i[i*3+1] - firstvertex;
4916                         e[2] = element3i[i*3+2] - firstvertex;
4917                 }
4918                 else
4919                 {
4920                         e[0] = i*3+0;
4921                         e[1] = i*3+1;
4922                         e[2] = i*3+2;
4923                 }
4924
4925 #define SKIPBACKFACE \
4926                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4927                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4928                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4929                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4930                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4931                 switch(cullface) \
4932                 { \
4933                 case GL_BACK: \
4934                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4935                                 continue; \
4936                         break; \
4937                 case GL_FRONT: \
4938                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4939                                 continue; \
4940                         break; \
4941                 }
4942
4943 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4944                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4945                         { \
4946                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4947                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4948                         }
4949 #define CLIPPEDVERTEXCOPY(k,p1) \
4950                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4951
4952 #define GENATTRIBCOPY(attrib, p1) \
4953                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4954 #define GENATTRIBLERP(attrib, p1, p2) \
4955                 { \
4956                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4957                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4958                 }
4959 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4960                 switch(clipcase) \
4961                 { \
4962                 default: \
4963                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4964                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4965                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4966                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4967                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4968                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4969                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4970                 }
4971
4972                 if (! clipped)
4973                         goto notclipped;
4974
4975                 // calculate distance from nearplane
4976                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4977                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4978                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4979                 if (clipdist[0] >= 0.0f)
4980                 {
4981                         if (clipdist[1] >= 0.0f)
4982                         {
4983                                 if (clipdist[2] >= 0.0f)
4984                                 {
4985                                 notclipped:
4986                                         // triangle is entirely in front of nearplane
4987                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4988                                         SKIPBACKFACE;
4989                                         numpoints = 3;
4990                                         clipcase = 0;
4991                                 }
4992                                 else
4993                                 {
4994                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4995                                         SKIPBACKFACE;
4996                                         numpoints = 4;
4997                                         clipcase = 1;
4998                                 }
4999                         }
5000                         else
5001                         {
5002                                 if (clipdist[2] >= 0.0f)
5003                                 {
5004                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5005                                         SKIPBACKFACE;
5006                                         numpoints = 4;
5007                                         clipcase = 2;
5008                                 }
5009                                 else
5010                                 {
5011                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5012                                         SKIPBACKFACE;
5013                                         numpoints = 3;
5014                                         clipcase = 3;
5015                                 }
5016                         }
5017                 }
5018                 else if (clipdist[1] >= 0.0f)
5019                 {
5020                         if (clipdist[2] >= 0.0f)
5021                         {
5022                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5023                                 SKIPBACKFACE;
5024                                 numpoints = 4;
5025                                 clipcase = 4;
5026                         }
5027                         else
5028                         {
5029                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5030                                 SKIPBACKFACE;
5031                                 numpoints = 3;
5032                                 clipcase = 5;
5033                         }
5034                 }
5035                 else if (clipdist[2] >= 0.0f)
5036                 {
5037                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5038                         SKIPBACKFACE;
5039                         numpoints = 3;
5040                         clipcase = 6;
5041                 }
5042                 else continue; // triangle is entirely behind nearplane
5043
5044                 {
5045                         // calculate integer y coords for triangle points
5046                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5047                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5048                                         screenmin = _mm_min_epi16(screeni, screenir),
5049                                         screenmax = _mm_max_epi16(screeni, screenir);
5050                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5051                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5052                         screenmin = _mm_max_epi16(screenmin, fbmin);
5053                         screenmax = _mm_min_epi16(screenmax, fbmax);
5054                         // skip offscreen triangles
5055                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5056                                 continue;
5057                         starty = _mm_extract_epi16(screenmin, 1);
5058                         endy = _mm_extract_epi16(screenmax, 1)+1;
5059                         if (starty >= maxy1 && endy <= miny2)
5060                                 continue;
5061                         screeny = _mm_srai_epi32(screeni, 16);
5062                 }
5063
5064                 triangle = &thread->triangles[thread->numtriangles];
5065
5066                 // calculate attribute plans for triangle data...
5067                 // okay, this triangle is going to produce spans, we'd better project
5068                 // the interpolants now (this is what gives perspective texturing),
5069                 // this consists of simply multiplying all arrays by the W coord
5070                 // (which is basically 1/Z), which will be undone per-pixel
5071                 // (multiplying by Z again) to get the perspective-correct array
5072                 // values
5073                 {
5074                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5075                         __m128 mipedgescale, mipdensity;
5076                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5077                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5078                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5079                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5080                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5081                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5082                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5083                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5084                         attribedge1 = _mm_sub_ss(w0, w1);
5085                         attribedge2 = _mm_sub_ss(w2, w1);
5086                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5087                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5088                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5089                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5090                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5091                         _mm_store_ss(&triangle->w[0], attribxslope);
5092                         _mm_store_ss(&triangle->w[1], attribyslope);
5093                         _mm_store_ss(&triangle->w[2], attriborigin);
5094                         
5095                         clip0origin = 0;
5096                         clip0slope = 0;
5097                         clip0dir = 0;
5098                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5099                         {
5100                                 float cliporigin, clipxslope, clipyslope;
5101                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5102                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5103                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5104                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5105                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5106                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5107                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5108                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5109                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5110                                 if(clipxslope != 0)
5111                                 {
5112                                         clip0origin = -cliporigin/clipxslope;
5113                                         clip0slope = -clipyslope/clipxslope;
5114                                         clip0dir = clipxslope > 0 ? 1 : -1;
5115                                 }
5116                                 else if(clipyslope > 0)
5117                                 {
5118                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5119                                         clip0slope = dpsoftrast.fb_width;
5120                                         clip0dir = -1;
5121                                 }
5122                                 else if(clipyslope < 0)
5123                                 {
5124                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5125                                         clip0slope = -dpsoftrast.fb_width;
5126                                         clip0dir = -1;
5127                                 }
5128                                 else if(clip0origin < 0) continue;
5129                         }
5130
5131                         mipedgescale = _mm_setzero_ps();
5132                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5133                         {
5134                                 __m128 attrib0, attrib1, attrib2;
5135                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5136                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5137                                         break;
5138                                 arrays += numvertices*4;
5139                                 GENATTRIBS(attrib0, attrib1, attrib2);
5140                                 attriborigin = _mm_mul_ps(attrib1, w1);
5141                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5142                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5143                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5144                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5145                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5146                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5147                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5148                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5149                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5150                                 {
5151                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5152                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5153                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5154                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5155                                 }
5156                         }
5157
5158                         memset(triangle->mip, 0, sizeof(triangle->mip));
5159                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5160                         {
5161                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5162                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5163                                         break;
5164                                 texture = thread->texbound[texunit];
5165                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5166                                 {
5167                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5168                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5169                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5170                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5171                                         // this will be multiplied in the texturing routine by the texture resolution
5172                                         y = _mm_cvtss_si32(mipdensity);
5173                                         if (y > 0)
5174                                         {
5175                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5176                                                 if (y > texture->mipmaps - 1)
5177                                                         y = texture->mipmaps - 1;
5178                                                 triangle->mip[texunit] = y;
5179                                         }
5180                                 }
5181                         }
5182                 }
5183         
5184                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5185                 for (; y < bandy;)
5186                 {
5187                         __m128 xcoords, xslope;
5188                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5189                         int yccmask = _mm_movemask_epi8(ycc);
5190                         int edge0p, edge0n, edge1p, edge1n;
5191                         int nexty;
5192                         float w, wslope;
5193                         float clip0;
5194                         if (numpoints == 4)
5195                         {
5196                                 switch(yccmask)
5197                                 {
5198                                 default:
5199                                 case 0xFFFF: /*0000*/ y = endy; continue;
5200                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5201                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5202                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5203                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5204                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5205                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5206                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5207                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5208                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5209                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5210                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5211                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5212                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5213                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5214                                 case 0x0000: /*1111*/ y++; continue;
5215                                 }
5216                         }
5217                         else
5218                         {
5219                                 switch(yccmask)
5220                                 {
5221                                 default:
5222                                 case 0xFFFF: /*000*/ y = endy; continue;
5223                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5224                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5225                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5226                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5227                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5228                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5229                                 case 0x0000: /*111*/ y++; continue;
5230                                 }
5231                         }
5232                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5233                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5234                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5235                         nexty = _mm_extract_epi16(ycc, 0);
5236                         if (nexty >= bandy) nexty = bandy-1;
5237                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5238                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5239                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5240                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5241                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5242                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5243                         {
5244                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5245                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5246                         }
5247                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5248                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5249                         {
5250                                 int startx, endx, offset;
5251                                 startx = _mm_cvtss_si32(xcoords);
5252                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5253                                 if (startx < minx) startx = minx;
5254                                 if (endx > maxx) endx = maxx;
5255                                 if (startx >= endx) continue;
5256
5257                                 if (clip0dir)
5258                                 {
5259                                         if (clip0dir > 0)
5260                                         {
5261                                                 if (startx < clip0) 
5262                                                 {
5263                                                         if(endx <= clip0) continue;
5264                                                         startx = (int)clip0;
5265                                                 }
5266                                         }
5267                                         else if (endx > clip0) 
5268                                         {
5269                                                 if(startx >= clip0) continue;
5270                                                 endx = (int)clip0;
5271                                         }
5272                                 }
5273                                                 
5274                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5275                                 {
5276                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5277                                         span->triangle = thread->numtriangles;
5278                                         span->x = offset;
5279                                         span->y = y;
5280                                         span->startx = 0;
5281                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5282                                         if (span->startx >= span->endx)
5283                                                 continue;
5284                                         wslope = triangle->w[0];
5285                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5286                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5287                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5288                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5289                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5290                                 }
5291                         }
5292                 }
5293
5294                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5295                 {
5296                         DPSOFTRAST_Draw_ProcessSpans(thread);
5297                         thread->numtriangles = 0;
5298                 }
5299         }
5300
5301         if (!ATOMIC_DECREMENT(command->refcount))
5302         {
5303                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5304                         MM_FREE(command->arrays);
5305         }
5306
5307         if (thread->numspans > 0 || thread->numtriangles > 0)
5308         {
5309                 DPSOFTRAST_Draw_ProcessSpans(thread);
5310                 thread->numtriangles = 0;
5311         }
5312 #endif
5313 }
5314
5315 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5316 {
5317         int i;
5318         int j;
5319         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5320         int datasize = 2*numvertices*sizeof(float[4]);
5321         DPSOFTRAST_Command_Draw *command;
5322         unsigned char *data;
5323         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5324         {
5325                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5326                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5327                         break;
5328                 datasize += numvertices*sizeof(float[4]);
5329         }
5330         if (element3s)
5331                 datasize += numtriangles*sizeof(unsigned short[3]);
5332         else if (element3i)
5333                 datasize += numtriangles*sizeof(int[3]);
5334         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5335         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5336         {
5337                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5338                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5339         }
5340         else
5341         {
5342                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5343                 data = (unsigned char *)command + commandsize;
5344         }
5345         command->firstvertex = firstvertex;
5346         command->numvertices = numvertices;
5347         command->numtriangles = numtriangles;
5348         command->arrays = (float *)data;
5349         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5350         dpsoftrast.firstvertex = firstvertex;
5351         dpsoftrast.numvertices = numvertices;
5352         dpsoftrast.screencoord4f = (float *)data;
5353         data += numvertices*sizeof(float[4]);
5354         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5355         data += numvertices*sizeof(float[4]);
5356         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5357         {
5358                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5359                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5360                         break;
5361                 dpsoftrast.post_array4f[j] = (float *)data;
5362                 data += numvertices*sizeof(float[4]);
5363         }
5364         command->element3i = NULL;
5365         command->element3s = NULL;
5366         if (element3s)
5367         {
5368                 command->element3s = (unsigned short *)data;
5369                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5370         }
5371         else if (element3i)
5372         {
5373                 command->element3i = (int *)data;
5374                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5375         }
5376         return command;
5377 }
5378
5379 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5380 {
5381         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5382         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5383         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5384         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5385         if (command->starty >= command->endy)
5386         {
5387                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5388                         MM_FREE(command->arrays);
5389                 DPSOFTRAST_UndoCommand(command->commandsize);
5390                 return;
5391         }
5392         command->clipped = dpsoftrast.drawclipped;
5393         command->refcount = dpsoftrast.numthreads;
5394
5395         if (dpsoftrast.usethreads)
5396         {
5397                 int i;
5398                 DPSOFTRAST_Draw_SyncCommands();
5399                 for (i = 0; i < dpsoftrast.numthreads; i++)
5400                 {
5401                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5402                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5403                                 Thread_CondSignal(thread->drawcond);
5404                 }
5405         }
5406         else
5407         {
5408                 DPSOFTRAST_Draw_FlushThreads();
5409         }
5410 }
5411
5412 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5413 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5414 {
5415         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5416 }
5417 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5418 {
5419         DPSOFTRAST_Command_SetRenderTargets *command;
5420         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5421                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5422                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5423                 DPSOFTRAST_Flush();
5424         dpsoftrast.fb_width = width;
5425         dpsoftrast.fb_height = height;
5426         dpsoftrast.fb_depthpixels = depthpixels;
5427         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5428         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5429         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5430         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5431         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5432         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5433         command->width = width;
5434         command->height = height;
5435 }
5436  
5437 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5438 {
5439         int commandoffset = thread->commandoffset;
5440         while (commandoffset != endoffset)
5441         {
5442                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5443                 switch (command->opcode)
5444                 {
5445 #define INTERPCOMMAND(name) \
5446                 case DPSOFTRAST_OPCODE_##name : \
5447                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5448                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5449                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5450                                 commandoffset = 0; \
5451                         break;
5452                 INTERPCOMMAND(Viewport)
5453                 INTERPCOMMAND(ClearColor)
5454                 INTERPCOMMAND(ClearDepth)
5455                 INTERPCOMMAND(ColorMask)
5456                 INTERPCOMMAND(DepthTest)
5457                 INTERPCOMMAND(ScissorTest)
5458                 INTERPCOMMAND(Scissor)
5459                 INTERPCOMMAND(BlendFunc)
5460                 INTERPCOMMAND(BlendSubtract)
5461                 INTERPCOMMAND(DepthMask)
5462                 INTERPCOMMAND(DepthFunc)
5463                 INTERPCOMMAND(DepthRange)
5464                 INTERPCOMMAND(PolygonOffset)
5465                 INTERPCOMMAND(CullFace)
5466                 INTERPCOMMAND(SetTexture)
5467                 INTERPCOMMAND(SetShader)
5468                 INTERPCOMMAND(Uniform4f)
5469                 INTERPCOMMAND(UniformMatrix4f)
5470                 INTERPCOMMAND(Uniform1i)
5471                 INTERPCOMMAND(SetRenderTargets)
5472                 INTERPCOMMAND(ClipPlane)
5473
5474                 case DPSOFTRAST_OPCODE_Draw:
5475                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5476                         commandoffset += command->commandsize;
5477                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5478                                 commandoffset = 0;
5479                         thread->commandoffset = commandoffset;
5480                         break;
5481
5482                 case DPSOFTRAST_OPCODE_Reset:
5483                         commandoffset = 0;
5484                         break;
5485                 }
5486         }
5487         thread->commandoffset = commandoffset;
5488 }
5489
5490 static int DPSOFTRAST_Draw_Thread(void *data)
5491 {
5492         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5493         while(thread->index >= 0)
5494         {
5495                 if (thread->commandoffset != dpsoftrast.drawcommand)
5496                 {
5497                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5498                 }
5499                 else 
5500                 {
5501                         Thread_LockMutex(thread->drawmutex);
5502                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5503                         {
5504                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5505                                 thread->starving = true;
5506                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5507                                 thread->starving = false;
5508                         }
5509                         Thread_UnlockMutex(thread->drawmutex);
5510                 }
5511         }   
5512         return 0;
5513 }
5514
5515 static void DPSOFTRAST_Draw_FlushThreads(void)
5516 {
5517         DPSOFTRAST_State_Thread *thread;
5518         int i;
5519         DPSOFTRAST_Draw_SyncCommands();
5520         if (dpsoftrast.usethreads) 
5521         {
5522                 for (i = 0; i < dpsoftrast.numthreads; i++)
5523                 {
5524                         thread = &dpsoftrast.threads[i];
5525                         if (thread->commandoffset != dpsoftrast.drawcommand)
5526                         {
5527                                 Thread_LockMutex(thread->drawmutex);
5528                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5529                                         Thread_CondSignal(thread->drawcond);
5530                                 Thread_UnlockMutex(thread->drawmutex);
5531                         }
5532                 }
5533                 for (i = 0; i < dpsoftrast.numthreads; i++)
5534                 {
5535                         thread = &dpsoftrast.threads[i];
5536                         if (thread->commandoffset != dpsoftrast.drawcommand)
5537                         {
5538                                 Thread_LockMutex(thread->drawmutex);
5539                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5540                                 {
5541                                         thread->waiting = true;
5542                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5543                                         thread->waiting = false;
5544                                 }
5545                                 Thread_UnlockMutex(thread->drawmutex);
5546                         }
5547                 }
5548         }
5549         else
5550         {
5551                 for (i = 0; i < dpsoftrast.numthreads; i++)
5552                 {
5553                         thread = &dpsoftrast.threads[i];
5554                         if (thread->commandoffset != dpsoftrast.drawcommand)
5555                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5556                 }
5557         }
5558         dpsoftrast.commandpool.usedcommands = 0;
5559 }
5560
// Public entry point that completes all queued commands; simply forwards to
// DPSOFTRAST_Draw_FlushThreads().
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
	return;
}
5565
// Currently identical to DPSOFTRAST_Flush(): forwards to it and returns.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
	return;
}
5570
5571 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5572 {
5573         int i;
5574         union
5575         {
5576                 int i;
5577                 unsigned char b[4];
5578         }
5579         u;
5580         u.i = 1;
5581         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5582         dpsoftrast.bigendian = u.b[3];
5583         dpsoftrast.fb_width = width;
5584         dpsoftrast.fb_height = height;
5585         dpsoftrast.fb_depthpixels = depthpixels;
5586         dpsoftrast.fb_colorpixels[0] = colorpixels;
5587         dpsoftrast.fb_colorpixels[1] = NULL;
5588         dpsoftrast.fb_colorpixels[1] = NULL;
5589         dpsoftrast.fb_colorpixels[1] = NULL;
5590         dpsoftrast.viewport[0] = 0;
5591         dpsoftrast.viewport[1] = 0;
5592         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5593         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5594         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5595         dpsoftrast.texture_firstfree = 1;
5596         dpsoftrast.texture_end = 1;
5597         dpsoftrast.texture_max = 0;
5598         dpsoftrast.color[0] = 1;
5599         dpsoftrast.color[1] = 1;
5600         dpsoftrast.color[2] = 1;
5601         dpsoftrast.color[3] = 1;
5602         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5603         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5604         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5605         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5606         for (i = 0; i < dpsoftrast.numthreads; i++)
5607         {
5608                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5609                 thread->index = i;
5610                 thread->cullface = GL_BACK;
5611         thread->colormask[0] = 1; 
5612                 thread->colormask[1] = 1;
5613                 thread->colormask[2] = 1;
5614                 thread->colormask[3] = 1;
5615                 thread->blendfunc[0] = GL_ONE;
5616                 thread->blendfunc[1] = GL_ZERO;
5617                 thread->depthmask = true;
5618                 thread->depthtest = true;
5619                 thread->depthfunc = GL_LEQUAL;
5620                 thread->scissortest = false;
5621                 thread->viewport[0] = 0;
5622                 thread->viewport[1] = 0;
5623                 thread->viewport[2] = dpsoftrast.fb_width;
5624                 thread->viewport[3] = dpsoftrast.fb_height;
5625                 thread->scissor[0] = 0;
5626                 thread->scissor[1] = 0;
5627                 thread->scissor[2] = dpsoftrast.fb_width;
5628                 thread->scissor[3] = dpsoftrast.fb_height;
5629                 thread->depthrange[0] = 0;
5630                 thread->depthrange[1] = 1;
5631                 thread->polygonoffset[0] = 0;
5632                 thread->polygonoffset[1] = 0;
5633                 thread->clipplane[0] = 0;
5634                 thread->clipplane[1] = 0;
5635                 thread->clipplane[2] = 0;
5636                 thread->clipplane[3] = 1;
5637         
5638                 thread->numspans = 0;
5639                 thread->numtriangles = 0;
5640                 thread->commandoffset = 0;
5641                 thread->waiting = false;
5642                 thread->starving = false;
5643            
5644                 thread->validate = -1;
5645                 DPSOFTRAST_Validate(thread, -1);
5646  
5647                 if (dpsoftrast.usethreads)
5648                 {
5649                         thread->waitcond = Thread_CreateCond();
5650                         thread->drawcond = Thread_CreateCond();
5651                         thread->drawmutex = Thread_CreateMutex();
5652                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5653                 }
5654         }
5655         return 0;
5656 }
5657
5658 void DPSOFTRAST_Shutdown(void)
5659 {
5660         int i;
5661         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5662         {
5663                 DPSOFTRAST_State_Thread *thread;
5664                 for (i = 0; i < dpsoftrast.numthreads; i++)
5665                 {
5666                         thread = &dpsoftrast.threads[i];
5667                         Thread_LockMutex(thread->drawmutex);
5668                         thread->index = -1;
5669                         Thread_CondSignal(thread->drawcond);
5670                         Thread_UnlockMutex(thread->drawmutex);
5671                         Thread_WaitThread(thread->thread, 0);
5672                         Thread_DestroyCond(thread->waitcond);
5673                         Thread_DestroyCond(thread->drawcond);
5674                         Thread_DestroyMutex(thread->drawmutex);
5675                 }
5676         }
5677         for (i = 0;i < dpsoftrast.texture_end;i++)
5678                 if (dpsoftrast.texture[i].bytes)
5679                         MM_FREE(dpsoftrast.texture[i].bytes);
5680         if (dpsoftrast.texture)
5681                 free(dpsoftrast.texture);
5682         if (dpsoftrast.threads)
5683                 MM_FREE(dpsoftrast.threads);
5684         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5685 }
5686