]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
avoid bugs introduced by the attempt to skip image processing - a closer
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
// MSVC only: silence warning C4324 (structure padded due to an alignment
// specifier), which the ALIGN()/ATOMIC() wrapped structs in this file trigger.
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
// In plain C builds `bool` is made an alias of the engine's qboolean type.
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
// Alignment sizes in bytes.  ALIGN_SIZE is reused as COMMAND_SIZE and
// ATOMIC_SIZE is the alignment handed to _mm_malloc by MM_MALLOC/MM_CALLOC.
17 #define ALIGN_SIZE 16
18 #define ATOMIC_SIZE 32
19
// Platform-specific alignment and atomic-counter primitives.  They are only
// selected when SSE_POSSIBLE is defined; each compiler branch defines:
//   ALIGN(var) / ATOMIC(var)   declare var with 16 / 32 byte alignment
//   MEMORY_BARRIER             expands to _mm_sfence() in every branch
//   ATOMIC_COUNTER             the counter type used on that platform
//   ATOMIC_INCREMENT/DECREMENT interlocked +1 / -1 on a counter
//   ATOMIC_ADD                 interlocked add whose result is discarded
// When no branch matches, the macros stay undefined here and the plain
// fallbacks defined right after this block are used instead.
20 #ifdef SSE_POSSIBLE
21         #if defined(__APPLE__)
22                 #include <libkern/OSAtomic.h>
23                 #define ALIGN(var) var __attribute__((__aligned__(16)))
24                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
25                 #define MEMORY_BARRIER (_mm_sfence())
26                 #define ATOMIC_COUNTER volatile int32_t 
27                 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28                 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29                 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
30         #elif defined(__GNUC__) && defined(WIN32)
31                 #define ALIGN(var) var __attribute__((__aligned__(16)))
32                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
33                 #define MEMORY_BARRIER (_mm_sfence())
34                 //(__sync_synchronize())
35                 #define ATOMIC_COUNTER volatile LONG
36                 // this LONG * cast serves to fix an issue with broken mingw
37                 // packages on Ubuntu; these only declare the function to take
38                 // a LONG *, causing a compile error here. This seems to be
39                 // error- and warn-free on platforms that DO declare
40                 // InterlockedIncrement correctly, like mingw on Windows.
41                 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
42                 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
43                 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
44         #elif defined(__GNUC__)
45                 #define ALIGN(var) var __attribute__((__aligned__(16)))
46                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
47                 #define MEMORY_BARRIER (_mm_sfence())
48                 //(__sync_synchronize())
49                 #define ATOMIC_COUNTER volatile int
50                 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
51                 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
52                 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
53         #elif defined(_MSC_VER)
54                 #define ALIGN(var) __declspec(align(16)) var
55                 #define ATOMIC(var) __declspec(align(32)) var
56                 #define MEMORY_BARRIER (_mm_sfence())
57                 //(MemoryBarrier())
58                 #define ATOMIC_COUNTER volatile LONG
59                 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
60                 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
61                 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
62         #endif
63 #endif
64
// Fallbacks for every primitive the platform block left undefined:
// no alignment attribute, a no-op memory barrier, and ordinary (non-atomic)
// integer arithmetic on a plain int counter.
65 #ifndef ALIGN
66 #define ALIGN(var) var
67 #endif
68 #ifndef ATOMIC
69 #define ATOMIC(var) var
70 #endif
71 #ifndef MEMORY_BARRIER
72 #define MEMORY_BARRIER ((void)0)
73 #endif
74 #ifndef ATOMIC_COUNTER
75 #define ATOMIC_COUNTER int
76 #endif
77 #ifndef ATOMIC_INCREMENT
78 #define ATOMIC_INCREMENT(counter) (++(counter))
79 #endif
80 #ifndef ATOMIC_DECREMENT
81 #define ATOMIC_DECREMENT(counter) (--(counter))
82 #endif
83 #ifndef ATOMIC_ADD
84 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
85 #endif
86
// Aligned allocation helpers.
// With SSE_POSSIBLE the buffers come from _mm_malloc with ATOMIC_SIZE
// alignment and must be released with MM_FREE (_mm_free); otherwise the
// helpers map straight onto malloc/calloc/free.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// The override is only wanted for GCC releases before 4.6 (presumably those
// lack a usable _mm_cvtss_f32 -- confirm).  The test must spell __GNUC__ in
// full and must compare the minor version only within the 4.x series,
// otherwise it is true for every GCC.
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
	#define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// calloc() counterpart of MM_MALLOC: returns nmemb*size zeroed bytes, or NULL
// when the allocation fails or when nmemb*size does not fit in a size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	void *ptr;
	// reject products that would wrap around and yield an undersized buffer
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	ptr = _mm_malloc(nmemb*size, ATOMIC_SIZE);
	if (ptr != NULL) memset(ptr, 0, nmemb*size);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
109
// Vertex attribute array slots: position, color and eight texcoord sets.
// DPSOFTRAST_ARRAY_TOTAL is the slot count and is used to size per-attribute
// arrays such as DPSOFTRAST_State_Triangle::attribs.
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION  = 0,
	DPSOFTRAST_ARRAY_COLOR     = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL     = 10
}
DPSOFTRAST_ARRAY;
125
// One entry of the texture table (dpsoftrast.texture).
// A slot whose `bytes` pointer is NULL counts as unused: the allocator
// re-uses it and DPSOFTRAST_Texture_GetByIndex refuses to return it.
126 typedef struct DPSOFTRAST_Texture_s
127 {
128         int flags; // flags value given to DPSOFTRAST_Texture_New
129         int width;
130         int height;
131         int depth;
132         int sides; // 6 when the cubemap flag is set, otherwise 1
133         DPSOFTRAST_TEXTURE_FILTER filter;
134         int mipmaps; // number of valid rows in mipmap[]
135         int size; // total byte size of all mip levels
136         ATOMIC_COUNTER binds; // when nonzero, DPSOFTRAST_Texture_Free flushes before releasing the pixels
137         unsigned char *bytes; // pixel storage for every mip level, 4 bytes per texel
        // per mip level: [0] byte offset into bytes, [1] byte size,
        // [2] width, [3] height, [4] depth
138         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
139 }
140 DPSOFTRAST_Texture;
141
// Commands are aligned like any ALIGN()ed object (COMMAND_SIZE == ALIGN_SIZE).
142 #define COMMAND_SIZE ALIGN_SIZE
143 #define COMMAND_ALIGN(var) ALIGN(var)
144
// Generic command header.  Every struct produced by DEFCOMMAND starts with
// these same two members, so a pointer to any command can be read through
// this type to find its opcode and size.
145 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
146 {
147         unsigned char opcode;
148         unsigned short commandsize;
149 }
150 DPSOFTRAST_Command);
151
// Opcode 0 has no payload struct of its own.
152 enum { DPSOFTRAST_OPCODE_Reset = 0 };
153
// DEFCOMMAND(opcodeval, name, fields) declares, for one command:
//   - the constant DPSOFTRAST_OPCODE_<name> with value opcodeval
//   - the aligned struct type DPSOFTRAST_Command_<name>, consisting of the
//     common header followed by `fields`
// The expansion already ends in ';', so uses need no trailing semicolon.
154 #define DEFCOMMAND(opcodeval, name, fields) \
155         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
156         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
157         { \
158                 unsigned char opcode; \
159                 unsigned short commandsize; \
160                 fields \
161         } DPSOFTRAST_Command_##name );
162
// Size in bytes of the command storage below, and (judging by the name) the
// upper bound for a single command -- TODO confirm where MAXCOMMANDSIZE is enforced.
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
165
// Byte pool holding queued commands; both the struct and its storage array
// are declared with ATOMIC() alignment.
166 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
167 {
168         int freecommand; // NOTE(review): presumably the next write offset into commands[] -- confirm
169         int usedcommands; // NOTE(review): presumably the number of bytes currently in use -- confirm
170         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
171 }
172 DPSOFTRAST_State_Command_Pool);
173
// Per-triangle interpolation setup consumed by the DPSOFTRAST_CALCATTRIB
// macros.  For each attribute array, attribs[array] holds three float4 rows:
//   [0] change per unit of span x, [1] change per unit of span y, [2] base value
// so the value at a span is  [2] + x*[0] + y*[1].
174 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
175 {
176         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
177         float w[3];
178         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
179 }
180 DPSOFTRAST_State_Triangle);
181
// Evaluate one interpolated vertex attribute at the start of a span.
//   triangle   pointer to a struct with float attribs[][3][4]
//              (rows: [0] d/dx, [1] d/dy, [2] base value)
//   span       pointer to a struct with integer x and y members
//   data       receives  base + x*d/dx + y*d/dy
//   slope      receives the d/dx row, for stepping along the span
//   arrayindex which attribute array to evaluate
// DPSOFTRAST_CALCATTRIB is the SSE form (data/slope are __m128 lvalues and the
// attribs rows must be 16-byte aligned for _mm_load_ps); DPSOFTRAST_CALCATTRIB4F
// is the scalar form (data/slope are float[4]).
// Every argument is parenthesized so expressions such as &spans[i] are safe;
// arguments are still evaluated more than once, so avoid side effects.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	(slope) = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	(data) = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), (slope)), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
198                                         
199 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
200
// One horizontal run of pixels produced while rasterizing a triangle.
// `triangle` is an index (see DPSOFTRAST_State_Thread::triangles) and x/y are
// the values the DPSOFTRAST_CALCATTRIB macros multiply the attribute slopes by.
201 typedef ALIGN(struct DPSOFTRAST_State_Span_s
202 {
203         int triangle; // triangle this span was generated by
204         int x; // framebuffer x coord
205         int y; // framebuffer y coord
206         int startx; // usable range (according to pixelmask)
207         int endx; // usable range (according to pixelmask)
208         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
209         int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
210         int depthslope; // depthbuffer value pixel delta
211 }
212 DPSOFTRAST_State_Span);
213
// Capacities of the per-thread spans[] and triangles[] arrays, and the size
// (before padding) of the per-thread pixelmaskarray.
214 #define DPSOFTRAST_DRAW_MAXSPANS 1024
215 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
216 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
217
// Dirty bits kept in DPSOFTRAST_State_Thread::validate.  DPSOFTRAST_Validate
// recomputes the derived state for each requested bit that is set
// (RecalcFB / RecalcDepthFunc / RecalcBlendFunc) and clears that bit.
// DPSOFTRAST_VALIDATE_DRAW is the union of all three.
218 #define DPSOFTRAST_VALIDATE_FB 1
219 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
220 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
221 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
222
// Blend modes the rasterizer implements.  DPSOFTRAST_RecalcBlendFunc maps the
// GL source/destination factor pair (and the subtract flag) onto one of these;
// unrecognised pairs fall back to DPSOFTRAST_BLENDMODE_OPAQUE.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
	DPSOFTRAST_BLENDMODE_ALPHA       = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
	DPSOFTRAST_BLENDMODE_ADD         = 3,
	DPSOFTRAST_BLENDMODE_INVMOD      = 4,
	DPSOFTRAST_BLENDMODE_MUL         = 5,
	DPSOFTRAST_BLENDMODE_MUL2        = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD      = 9,
	DPSOFTRAST_BLENDMODE_TOTAL       = 10
}
DPSOFTRAST_BLENDMODE;
238
// Complete state of one rasterizer thread: its own copy of the GL-like render
// state, the values derived from it (fb_* members, recomputed lazily through
// the `validate` dirty bits), the screen bands it owns, its synchronisation
// handles, and scratch storage for the spans/triangles being drawn.
239 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
240 {
241         void *thread;
242         int index; // position of this thread in 0..numthreads-1; selects its bands in DPSOFTRAST_RecalcThread
243         
244         int cullface;
245         int colormask[4];
246         int blendfunc[2]; // [0] source factor, [1] destination factor (GL constants)
247         int blendsubtract;
248         int depthmask;
249         int depthtest;
250         int depthfunc;
251         int scissortest;
252         int alphatest;
253         int alphafunc;
254         float alphavalue;
255         int viewport[4]; // x, y, width, height as consumed by DPSOFTRAST_RecalcViewport
256         int scissor[4]; // x, y, width, height as consumed by DPSOFTRAST_RecalcFB
257         float depthrange[2];
258         float polygonoffset[2];
259         float clipplane[4];
260         ALIGN(float fb_clipplane[4]); // clipplane converted by DPSOFTRAST_RecalcClipPlane using the fb_viewport* values
261
262         int shader_mode;
263         int shader_permutation;
264         int shader_exactspecularmath;
265
266         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // point into dpsoftrast.texture; rebased when that table is reallocated
267         
268         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
269         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
270
271         // DPSOFTRAST_VALIDATE_ flags
272         int validate;
273
274         // derived values (DPSOFTRAST_VALIDATE_FB)
275         int fb_colormask;
276         int fb_scissor[4]; // x, y, width, height after clamping to the framebuffer (y measured from the top)
        // element [1] derives from viewport x/width, [2] from viewport y/height,
        // [3] and [0] are constants (see DPSOFTRAST_RecalcViewport)
277         ALIGN(float fb_viewportcenter[4]);
278         ALIGN(float fb_viewportscale[4]);
279
280         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
281         int fb_depthfunc; // depthfunc, or GL_ALWAYS while depthtest is off
282
283         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
284         int fb_blendmode; // a DPSOFTRAST_BLENDMODE value
285
286         // band boundaries
        // two row ranges [miny1,maxy1) and [miny2,maxy2); identical unless
        // dpsoftrast.interlace is set -- NOTE(review): half-open is inferred, confirm
287         int miny1;
288         int maxy1;
289         int miny2;
290         int maxy2;
291
292         ATOMIC(volatile int commandoffset);
293
294         volatile bool waiting;
295         volatile bool starving;
296         void *waitcond;
297         void *drawcond;
298         void *drawmutex;
299
300         int numspans;
301         int numtriangles;
302         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
303         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
304         unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
305 }
306 DPSOFTRAST_State_Thread);
307
// Global rasterizer state shared by the API entry points: framebuffer
// description, the caller-side copy of render state and vertex pointers, the
// texture table, the worker thread array and the command pool.
308 typedef ATOMIC(struct DPSOFTRAST_State_s
309 {
310         int fb_width;
311         int fb_height;
312         unsigned int *fb_depthpixels;
313         unsigned int *fb_colorpixels[4];
314
315         int viewport[4];
316         ALIGN(float fb_viewportcenter[4]);
317         ALIGN(float fb_viewportscale[4]);
318
319         float color[4];
320         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
321         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
322
323         const float *pointer_vertex3f;
324         const float *pointer_color4f;
325         const unsigned char *pointer_color4ub;
326         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
327         int stride_vertex;
328         int stride_color;
329         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
330         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
331         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // point into `texture`; rebased by DPSOFTRAST_Texture_Grow
332
333         int firstvertex;
334         int numvertices;
335         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
336         float *screencoord4f;
337         int drawstarty;
338         int drawendy;
339         int drawclipped;
340         
341         int shader_mode;
342         int shader_permutation;
343         int shader_exactspecularmath;
344
        // texture table: `texture` has room for texture_max entries, entries at
        // index >= texture_end are unused, and the search for a free slot
        // starts at texture_firstfree
345         int texture_max;
346         int texture_end;
347         int texture_firstfree;
348         DPSOFTRAST_Texture *texture;
349
350         int bigendian;
351
352         // error reporting
353         const char *errorstring; // set to a static message by functions that fail
354
355         bool usethreads;
356         int interlace; // nonzero gives every thread two separate screen bands (see DPSOFTRAST_RecalcThread)
357         int numthreads; // number of entries in `threads`
358         DPSOFTRAST_State_Thread *threads;
359
360         ATOMIC(volatile int drawcommand);
361
362         DPSOFTRAST_State_Command_Pool commandpool;
363 }
364 DPSOFTRAST_State);
365
// The single global instance used throughout this file.
366 DPSOFTRAST_State dpsoftrast;
367
// Depth buffer encoding constants: DPSOFTRAST_DEPTHSCALE is 2^30.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Pack four float channels (nominally 0..1) into one 32-bit value laid out as
// 0xAARRGGBB, rounding each channel to the nearest of 0..255.
// Arguments are parenthesized so expressions may be passed, and the shifts are
// done on unsigned values so that an alpha of 128 or more does not shift into
// the sign bit of a signed int; the result is converted back to int so the
// macro keeps its original type.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Convert a float depth d into the integer depth buffer value
// DPSOFTRAST_DEPTHSCALE * (1 - d), truncated towards zero.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
372
373 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
374 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
375
376 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
377 {
378         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
379         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
380         fb_viewportcenter[3] = 0.5f;
381         fb_viewportcenter[0] = 0.0f;
382         fb_viewportscale[1] = 0.5f * viewport[2];
383         fb_viewportscale[2] = -0.5f * viewport[3];
384         fb_viewportscale[3] = 0.5f;
385         fb_viewportscale[0] = 1.0f;
386 }
387
388 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
389 {
390         if (dpsoftrast.interlace)
391         {
392                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
393                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
394                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
395                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
396         }
397         else
398         {
399                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
400                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
401         }
402 }
403
404 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
405 {
406         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
407         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
408         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
409         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
410         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
411 }
412
413 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
414 {
415         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
416         // and viewport projection values
417         int x1, x2;
418         int y1, y2;
419         x1 = thread->scissor[0];
420         x2 = thread->scissor[0] + thread->scissor[2];
421         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
422         y2 = dpsoftrast.fb_height - thread->scissor[1];
423         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
424         if (x1 < 0) x1 = 0;
425         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
426         if (y1 < 0) y1 = 0;
427         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
428         thread->fb_scissor[0] = x1;
429         thread->fb_scissor[1] = y1;
430         thread->fb_scissor[2] = x2 - x1;
431         thread->fb_scissor[3] = y2 - y1;
432
433         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
434         DPSOFTRAST_RecalcClipPlane(thread);
435         DPSOFTRAST_RecalcThread(thread);
436 }
437
438 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
439 {
440         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
441 }
442
443 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
444 {
445         if (thread->blendsubtract)
446         {
447                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
448                 {
449                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
450                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
451                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
452                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
453                 }
454         }
455         else
456         {       
457                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
458                 {
459                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
460                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
461                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
462                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
463                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
464                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
465                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
466                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
467                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
468                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
469                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
470                 }
471         }
472 }
473
// Cheap front end for DPSOFTRAST_Validate: tests the thread's dirty bits
// inline and only makes the call when one of the requested bits f is set.
// Always evaluates to 0.  `thread` is parenthesized so any pointer expression
// may be passed; both arguments can be evaluated twice, so they must not have
// side effects.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
475
476 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
477 {
478         mask &= thread->validate;
479         if (!mask)
480                 return;
481         if (mask & DPSOFTRAST_VALIDATE_FB)
482         {
483                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
484                 DPSOFTRAST_RecalcFB(thread);
485         }
486         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
487         {
488                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
489                 DPSOFTRAST_RecalcDepthFunc(thread);
490         }
491         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
492         {
493                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
494                 DPSOFTRAST_RecalcBlendFunc(thread);
495         }
496 }
497
498 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
499 {
500         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
501                 return &dpsoftrast.texture[index];
502         return NULL;
503 }
504
505 static void DPSOFTRAST_Texture_Grow(void)
506 {
507         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
508         DPSOFTRAST_State_Thread *thread;
509         int i;
510         int j;
511         DPSOFTRAST_Flush();
512         // expand texture array as needed
513         if (dpsoftrast.texture_max < 1024)
514                 dpsoftrast.texture_max = 1024;
515         else
516                 dpsoftrast.texture_max *= 2;
517         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
518         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
519                 if (dpsoftrast.texbound[i])
520                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
521         for (j = 0; j < dpsoftrast.numthreads; j++)
522         {
523                 thread = &dpsoftrast.threads[j];
524                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
525                         if (thread->texbound[i])
526                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
527         }
528 }
529
530 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
531 {
532         int w;
533         int h;
534         int d;
535         int size;
536         int s;
537         int texnum;
538         int mipmaps;
539         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
540         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
541         DPSOFTRAST_Texture *texture;
542         if (width*height*depth < 1)
543         {
544                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
545                 return 0;
546         }
547         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
548         {
549                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
550                 return 0;
551         }
552         switch(texformat)
553         {
554         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
555         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
556         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
557                 break;
558         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
559                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
560                 {
561                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
562                         return 0;
563                 }
564                 if (depth != 1)
565                 {
566                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
567                         return 0;
568                 }
569                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
570                 {
571                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
572                         return 0;
573                 }
574                 break;
575         }
576         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
577         {
578                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
579                 return 0;
580         }
581         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
582         {
583                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
584                 return 0;
585         }
586         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
587         {
588                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
589                 return 0;
590         }
591         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
592         {
593                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
594                 return 0;
595         }
596         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
597         {
598                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
599                 return 0;
600         }
601         // find first empty slot in texture array
602         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
603                 if (!dpsoftrast.texture[texnum].bytes)
604                         break;
605         dpsoftrast.texture_firstfree = texnum + 1;
606         if (dpsoftrast.texture_max <= texnum)
607                 DPSOFTRAST_Texture_Grow();
608         if (dpsoftrast.texture_end <= texnum)
609                 dpsoftrast.texture_end = texnum + 1;
610         texture = &dpsoftrast.texture[texnum];
611         memset(texture, 0, sizeof(*texture));
612         texture->flags = flags;
613         texture->width = width;
614         texture->height = height;
615         texture->depth = depth;
616         texture->sides = sides;
617         texture->binds = 0;
618         w = width;
619         h = height;
620         d = depth;
621         size = 0;
622         mipmaps = 0;
623         w = width;
624         h = height;
625         d = depth;
626         for (;;)
627         {
628                 s = w * h * d * sides * 4;
629                 texture->mipmap[mipmaps][0] = size;
630                 texture->mipmap[mipmaps][1] = s;
631                 texture->mipmap[mipmaps][2] = w;
632                 texture->mipmap[mipmaps][3] = h;
633                 texture->mipmap[mipmaps][4] = d;
634                 size += s;
635                 mipmaps++;
636                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
637                         break;
638                 if (w > 1) w >>= 1;
639                 if (h > 1) h >>= 1;
640                 if (d > 1) d >>= 1;
641         }
642         texture->mipmaps = mipmaps;
643         texture->size = size;
644
645         // allocate the pixels now
646         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
647
648         return texnum;
649 }
650 void DPSOFTRAST_Texture_Free(int index)
651 {
652         DPSOFTRAST_Texture *texture;
653         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
654         if (texture->binds)
655                 DPSOFTRAST_Flush();
656         if (texture->bytes)
657                 MM_FREE(texture->bytes);
658         texture->bytes = NULL;
659         memset(texture, 0, sizeof(*texture));
660         // adjust the free range and used range
661         if (dpsoftrast.texture_firstfree > index)
662                 dpsoftrast.texture_firstfree = index;
663         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
664                 dpsoftrast.texture_end--;
665 }
666 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
667 {
668         int i, x, y, z, w, layer0, layer1, row0, row1;
669         unsigned char *o, *i0, *i1, *i2, *i3;
670         DPSOFTRAST_Texture *texture;
671         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
672         if (texture->mipmaps <= 1)
673                 return;
674         for (i = 1;i < texture->mipmaps;i++)
675         {
676                 for (z = 0;z < texture->mipmap[i][4];z++)
677                 {
678                         layer0 = z*2;
679                         layer1 = z*2+1;
680                         if (layer1 >= texture->mipmap[i-1][4])
681                                 layer1 = texture->mipmap[i-1][4]-1;
682                         for (y = 0;y < texture->mipmap[i][3];y++)
683                         {
684                                 row0 = y*2;
685                                 row1 = y*2+1;
686                                 if (row1 >= texture->mipmap[i-1][3])
687                                         row1 = texture->mipmap[i-1][3]-1;
688                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
689                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
690                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
691                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
692                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
693                                 w = texture->mipmap[i][2];
694                                 if (layer1 > layer0)
695                                 {
696                                         if (texture->mipmap[i-1][2] > 1)
697                                         {
698                                                 // average 3D texture
699                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
700                                                 {
701                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
702                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
703                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
704                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
705                                                 }
706                                         }
707                                         else
708                                         {
709                                                 // average 3D mipmap with parent width == 1
710                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
711                                                 {
712                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
713                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
714                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
715                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
716                                                 }
717                                         }
718                                 }
719                                 else
720                                 {
721                                         if (texture->mipmap[i-1][2] > 1)
722                                         {
723                                                 // average 2D texture (common case)
724                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
725                                                 {
726                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
727                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
728                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
729                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
730                                                 }
731                                         }
732                                         else
733                                         {
734                                                 // 2D texture with parent width == 1
735                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
736                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
737                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
738                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
739                                         }
740                                 }
741                         }
742                 }
743         }
744 }
745 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
746 {
747         DPSOFTRAST_Texture *texture;
748         unsigned char *dst;
749         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
750         if (texture->binds)
751                 DPSOFTRAST_Flush();
752         if (pixels)
753         {
754                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
755                 while (blockheight > 0)
756                 {
757                         memcpy(dst, pixels, blockwidth * 4);
758                         pixels += blockwidth * 4;
759                         dst += texture->mipmap[0][2] * 4;
760                         blockheight--;
761                 }
762         }
763         DPSOFTRAST_Texture_CalculateMipmaps(index);
764 }
765 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
766 {
767         DPSOFTRAST_Texture *texture;
768         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
769         if (texture->binds)
770                 DPSOFTRAST_Flush();
771         if (pixels)
772                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
773         DPSOFTRAST_Texture_CalculateMipmaps(index);
774 }
775 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
776 {
777         DPSOFTRAST_Texture *texture;
778         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
779         return texture->mipmap[mip][2];
780 }
781 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
782 {
783         DPSOFTRAST_Texture *texture;
784         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
785         return texture->mipmap[mip][3];
786 }
787 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
788 {
789         DPSOFTRAST_Texture *texture;
790         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
791         return texture->mipmap[mip][4];
792 }
793 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
794 {
795         DPSOFTRAST_Texture *texture;
796         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
797         if (texture->binds)
798                 DPSOFTRAST_Flush();
799         return texture->bytes + texture->mipmap[mip][0];
800 }
801 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
802 {
803         DPSOFTRAST_Texture *texture;
804         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
805         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
806         {
807                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
808                 return;
809         }
810         if (texture->binds)
811                 DPSOFTRAST_Flush();
812         texture->filter = filter;
813 }
814
815 static void DPSOFTRAST_Draw_FlushThreads(void);
816
817 static void DPSOFTRAST_Draw_SyncCommands(void)
818 {
819         if(dpsoftrast.usethreads) MEMORY_BARRIER;
820         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
821 }
822
823 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
824 {
825         DPSOFTRAST_State_Thread *thread;
826         int i;
827         int freecommand = dpsoftrast.commandpool.freecommand;
828         int usedcommands = dpsoftrast.commandpool.usedcommands;
829         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
830                 return;
831         DPSOFTRAST_Draw_SyncCommands();
832         for(;;)
833         {
834                 int waitindex = -1;
835                 int commandoffset;
836                 usedcommands = 0;
837                 for (i = 0; i < dpsoftrast.numthreads; i++)
838                 {
839                         thread = &dpsoftrast.threads[i]; 
840                         commandoffset = freecommand - thread->commandoffset;
841                         if (commandoffset < 0)
842                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
843                         if (commandoffset > usedcommands)
844                         {
845                                 waitindex = i;
846                                 usedcommands = commandoffset;
847                         }
848                 }
849                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
850                         break;
851                 thread = &dpsoftrast.threads[waitindex];
852                 Thread_LockMutex(thread->drawmutex);
853                 if (thread->commandoffset != dpsoftrast.drawcommand)
854                 {
855                         thread->waiting = true;
856                         if (thread->starving) Thread_CondSignal(thread->drawcond);
857                         Thread_CondWait(thread->waitcond, thread->drawmutex);
858                         thread->waiting = false;
859                 }
860                 Thread_UnlockMutex(thread->drawmutex);
861         }
862         dpsoftrast.commandpool.usedcommands = usedcommands;
863 }
864
// Pads `size` with the number of units missing to the next COMMAND_SIZE
// boundary; the padding is computed by masking with COMMAND_SIZE-1.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
        ((size) + ((COMMAND_SIZE - ((size) & (COMMAND_SIZE - 1))) & (COMMAND_SIZE - 1)))
// Reserves a command of type DPSOFTRAST_Command_<name> in the command pool,
// tagged with DPSOFTRAST_OPCODE_<name>, and yields a typed pointer to it.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
        ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_##name, DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_##name))))
869
870 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
871 {
872         DPSOFTRAST_Command *command;
873         int freecommand = dpsoftrast.commandpool.freecommand;
874         int usedcommands = dpsoftrast.commandpool.usedcommands;
875         int extra = sizeof(DPSOFTRAST_Command);
876         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
877                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
878         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
879         {
880                 if (dpsoftrast.usethreads)
881                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
882                 else
883                         DPSOFTRAST_Draw_FlushThreads();
884                 freecommand = dpsoftrast.commandpool.freecommand;
885                 usedcommands = dpsoftrast.commandpool.usedcommands;
886         }
887         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
888         {
889                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
890                 command->opcode = DPSOFTRAST_OPCODE_Reset;
891                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
892                 freecommand = 0;
893         }
894         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
895         command->opcode = opcode;
896         command->commandsize = size;
897         freecommand += size;
898         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
899                 freecommand = 0;
900         dpsoftrast.commandpool.freecommand = freecommand;
901         dpsoftrast.commandpool.usedcommands = usedcommands + size;
902         return command;
903 }
904
905 static void DPSOFTRAST_UndoCommand(int size)
906 {
907         int freecommand = dpsoftrast.commandpool.freecommand;
908         int usedcommands = dpsoftrast.commandpool.usedcommands;
909         freecommand -= size;
910         if (freecommand < 0)
911                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
912         usedcommands -= size;
913         dpsoftrast.commandpool.freecommand = freecommand;
914         dpsoftrast.commandpool.usedcommands = usedcommands;
915 }
916                 
917 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
918 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
919 {
920         thread->viewport[0] = command->x;
921         thread->viewport[1] = command->y;
922         thread->viewport[2] = command->width;
923         thread->viewport[3] = command->height;
924         thread->validate |= DPSOFTRAST_VALIDATE_FB;
925 }
926 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
927 {
928         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
929         command->x = x;
930         command->y = y;
931         command->width = width;
932         command->height = height;
933
934         dpsoftrast.viewport[0] = x;
935         dpsoftrast.viewport[1] = y;
936         dpsoftrast.viewport[2] = width;
937         dpsoftrast.viewport[3] = height;
938         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
939 }
940
941 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
942 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
943 {
944         int i, x1, y1, x2, y2, w, h, x, y;
945         int miny1, maxy1, miny2, maxy2;
946         int bandy;
947         unsigned int *p;
948         unsigned int c;
949         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
950         miny1 = thread->miny1;
951         maxy1 = thread->maxy1;
952         miny2 = thread->miny2;
953         maxy2 = thread->maxy2;
954         x1 = thread->fb_scissor[0];
955         y1 = thread->fb_scissor[1];
956         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
957         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
958         if (y1 < miny1) y1 = miny1;
959         if (y2 > maxy2) y2 = maxy2;
960         w = x2 - x1;
961         h = y2 - y1;
962         if (w < 1 || h < 1)
963                 return;
964         // FIXME: honor fb_colormask?
965         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
966         for (i = 0;i < 4;i++)
967         {
968                 if (!dpsoftrast.fb_colorpixels[i])
969                         continue;
970                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
971                 for (;y < bandy;y++)
972                 {
973                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
974                         for (x = x1;x < x2;x++)
975                                 p[x] = c;
976                 }
977         }
978 }
979 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
980 {
981         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
982         command->r = r;
983         command->g = g;
984         command->b = b;
985         command->a = a;
986 }
987
988 DEFCOMMAND(3, ClearDepth, float depth;)
989 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
990 {
991         int x1, y1, x2, y2, w, h, x, y;
992         int miny1, maxy1, miny2, maxy2;
993         int bandy;
994         unsigned int *p;
995         unsigned int c;
996         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
997         miny1 = thread->miny1;
998         maxy1 = thread->maxy1;
999         miny2 = thread->miny2;
1000         maxy2 = thread->maxy2;
1001         x1 = thread->fb_scissor[0];
1002         y1 = thread->fb_scissor[1];
1003         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1004         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
1005         if (y1 < miny1) y1 = miny1;
1006         if (y2 > maxy2) y2 = maxy2;
1007         w = x2 - x1;
1008         h = y2 - y1;
1009         if (w < 1 || h < 1)
1010                 return;
1011         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
1012         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1013         for (;y < bandy;y++)
1014         {
1015                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1016                 for (x = x1;x < x2;x++)
1017                         p[x] = c;
1018         }
1019 }
1020 void DPSOFTRAST_ClearDepth(float d)
1021 {
1022         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1023         command->depth = d;
1024 }
1025
1026 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1027 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1028 {
1029         thread->colormask[0] = command->r != 0;
1030         thread->colormask[1] = command->g != 0;
1031         thread->colormask[2] = command->b != 0;
1032         thread->colormask[3] = command->a != 0;
1033         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1034 }
1035 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1036 {
1037         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1038         command->r = r;
1039         command->g = g;
1040         command->b = b;
1041         command->a = a;
1042 }
1043
1044 DEFCOMMAND(5, DepthTest, int enable;)
1045 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1046 {
1047         thread->depthtest = command->enable;
1048         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1049 }
1050 void DPSOFTRAST_DepthTest(int enable)
1051 {
1052         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1053         command->enable = enable;
1054 }
1055
1056 DEFCOMMAND(6, ScissorTest, int enable;)
1057 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1058 {
1059         thread->scissortest = command->enable;
1060         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1061 }
1062 void DPSOFTRAST_ScissorTest(int enable)
1063 {
1064         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1065         command->enable = enable;
1066 }
1067
1068 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1069 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1070 {
1071         thread->scissor[0] = command->x;
1072         thread->scissor[1] = command->y;
1073         thread->scissor[2] = command->width;
1074         thread->scissor[3] = command->height;
1075         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1076 }
1077 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1078 {
1079         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1080         command->x = x;
1081         command->y = y;
1082         command->width = width;
1083         command->height = height;
1084 }
1085
1086 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1087 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1088 {
1089         thread->blendfunc[0] = command->sfactor;
1090         thread->blendfunc[1] = command->dfactor;
1091         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1092 }
1093 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1094 {
1095         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1096         command->sfactor = sfactor;
1097         command->dfactor = dfactor;
1098 }
1099
1100 DEFCOMMAND(9, BlendSubtract, int enable;)
1101 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1102 {
1103         thread->blendsubtract = command->enable;
1104         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1105 }
1106 void DPSOFTRAST_BlendSubtract(int enable)
1107 {
1108         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1109         command->enable = enable;
1110 }
1111
1112 DEFCOMMAND(10, DepthMask, int enable;)
1113 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1114 {
1115         thread->depthmask = command->enable;
1116 }
1117 void DPSOFTRAST_DepthMask(int enable)
1118 {
1119         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1120         command->enable = enable;
1121 }
1122
1123 DEFCOMMAND(11, DepthFunc, int func;)
1124 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1125 {
1126         thread->depthfunc = command->func;
1127 }
1128 void DPSOFTRAST_DepthFunc(int func)
1129 {
1130         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1131         command->func = func;
1132 }
1133
1134 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1135 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1136 {
1137         thread->depthrange[0] = command->nearval;
1138         thread->depthrange[1] = command->farval;
1139 }
1140 void DPSOFTRAST_DepthRange(float nearval, float farval)
1141 {
1142         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1143         command->nearval = nearval;
1144         command->farval = farval;
1145 }
1146
1147 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1148 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1149 {
1150         thread->polygonoffset[0] = command->alongnormal;
1151         thread->polygonoffset[1] = command->intoview;
1152 }
1153 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1154 {
1155         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1156         command->alongnormal = alongnormal;
1157         command->intoview = intoview;
1158 }
1159
1160 DEFCOMMAND(14, CullFace, int mode;)
1161 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1162 {
1163         thread->cullface = command->mode;
1164 }
1165 void DPSOFTRAST_CullFace(int mode)
1166 {
1167         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1168         command->mode = mode;
1169 }
1170
1171 DEFCOMMAND(15, AlphaTest, int enable;)
1172 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1173 {
1174         thread->alphatest = command->enable;
1175 }
1176 void DPSOFTRAST_AlphaTest(int enable)
1177 {
1178         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1179         command->enable = enable;
1180 }
1181
1182 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1183 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1184 {
1185         thread->alphafunc = command->func;
1186         thread->alphavalue = command->ref;
1187 }
1188 void DPSOFTRAST_AlphaFunc(int func, float ref)
1189 {
1190         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1191         command->func = func;
1192         command->ref = ref;
1193 }
1194
1195 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1196 {
1197         dpsoftrast.color[0] = r;
1198         dpsoftrast.color[1] = g;
1199         dpsoftrast.color[2] = b;
1200         dpsoftrast.color[3] = a;
1201 }
1202
1203 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1204 {
1205         int outstride = blockwidth * 4;
1206         int instride = dpsoftrast.fb_width * 4;
1207         int bx1 = blockx;
1208         int by1 = blocky;
1209         int bx2 = blockx + blockwidth;
1210         int by2 = blocky + blockheight;
1211         int bw;
1212         int x;
1213         int y;
1214         unsigned char *inpixels;
1215         unsigned char *b;
1216         unsigned char *o;
1217         DPSOFTRAST_Flush();
1218         if (bx1 < 0) bx1 = 0;
1219         if (by1 < 0) by1 = 0;
1220         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1221         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1222         bw = bx2 - bx1;
1223         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1224         if (dpsoftrast.bigendian)
1225         {
1226                 for (y = by1;y < by2;y++)
1227                 {
1228                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1229                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1230                         for (x = bx1;x < bx2;x++)
1231                         {
1232                                 o[0] = b[3];
1233                                 o[1] = b[2];
1234                                 o[2] = b[1];
1235                                 o[3] = b[0];
1236                                 o += 4;
1237                                 b += 4;
1238                         }
1239                 }
1240         }
1241         else
1242         {
1243                 for (y = by1;y < by2;y++)
1244                 {
1245                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1246                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1247                         memcpy(o, b, bw*4);
1248                 }
1249         }
1250
1251 }
1252 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1253 {
1254         int tx1 = tx;
1255         int ty1 = ty;
1256         int tx2 = tx + width;
1257         int ty2 = ty + height;
1258         int sx1 = sx;
1259         int sy1 = sy;
1260         int sx2 = sx + width;
1261         int sy2 = sy + height;
1262         int swidth;
1263         int sheight;
1264         int twidth;
1265         int theight;
1266         int sw;
1267         int sh;
1268         int tw;
1269         int th;
1270         int y;
1271         unsigned int *spixels;
1272         unsigned int *tpixels;
1273         DPSOFTRAST_Texture *texture;
1274         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1275         if (mip < 0 || mip >= texture->mipmaps) return;
1276         DPSOFTRAST_Flush();
1277         spixels = dpsoftrast.fb_colorpixels[0];
1278         swidth = dpsoftrast.fb_width;
1279         sheight = dpsoftrast.fb_height;
1280         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1281         twidth = texture->mipmap[mip][2];
1282         theight = texture->mipmap[mip][3];
1283         if (tx1 < 0) tx1 = 0;
1284         if (ty1 < 0) ty1 = 0;
1285         if (tx2 > twidth) tx2 = twidth;
1286         if (ty2 > theight) ty2 = theight;
1287         if (sx1 < 0) sx1 = 0;
1288         if (sy1 < 0) sy1 = 0;
1289         if (sx2 > swidth) sx2 = swidth;
1290         if (sy2 > sheight) sy2 = sheight;
1291         tw = tx2 - tx1;
1292         th = ty2 - ty1;
1293         sw = sx2 - sx1;
1294         sh = sy2 - sy1;
1295         if (tw > sw) tw = sw;
1296         if (th > sh) th = sh;
1297         if (tw < 1 || th < 1)
1298                 return;
1299         sy1 = sheight - 1 - sy1;
1300         for (y = 0;y < th;y++)
1301                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1302         if (texture->mipmaps > 1)
1303                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1304 }
1305
1306 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1307 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1308 {
1309         if (thread->texbound[command->unitnum])
1310                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1311         thread->texbound[command->unitnum] = command->texture;
1312 }
1313 void DPSOFTRAST_SetTexture(int unitnum, int index)
1314 {
1315         DPSOFTRAST_Command_SetTexture *command;
1316         DPSOFTRAST_Texture *texture;
1317         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1318         {
1319                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1320                 return;
1321         }
1322         texture = DPSOFTRAST_Texture_GetByIndex(index);
1323         if (index && !texture)
1324         {
1325                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1326                 return;
1327         }
1328
1329         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1330         command->unitnum = unitnum;
1331         command->texture = texture;
1332
1333         dpsoftrast.texbound[unitnum] = texture;
1334         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1335 }
1336
1337 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1338 {
1339         dpsoftrast.pointer_vertex3f = vertex3f;
1340         dpsoftrast.stride_vertex = stride;
1341 }
1342 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1343 {
1344         dpsoftrast.pointer_color4f = color4f;
1345         dpsoftrast.pointer_color4ub = NULL;
1346         dpsoftrast.stride_color = stride;
1347 }
1348 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1349 {
1350         dpsoftrast.pointer_color4f = NULL;
1351         dpsoftrast.pointer_color4ub = color4ub;
1352         dpsoftrast.stride_color = stride;
1353 }
1354 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1355 {
1356         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1357         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1358         dpsoftrast.stride_texcoord[unitnum] = stride;
1359 }
1360
1361 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1362 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1363 {
1364         thread->shader_mode = command->mode;
1365         thread->shader_permutation = command->permutation;
1366         thread->shader_exactspecularmath = command->exactspecularmath;
1367 }
1368 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1369 {
1370         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1371         command->mode = mode;
1372         command->permutation = permutation;
1373         command->exactspecularmath = exactspecularmath;
1374
1375         dpsoftrast.shader_mode = mode;
1376         dpsoftrast.shader_permutation = permutation;
1377         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1378 }
1379
1380 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1381 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1382 {
1383         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1384 }
1385 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1386 {
1387         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1388         command->index = index;
1389         command->val[0] = v0;
1390         command->val[1] = v1;
1391         command->val[2] = v2;
1392         command->val[3] = v3;
1393
1394         dpsoftrast.uniform4f[index*4+0] = v0;
1395         dpsoftrast.uniform4f[index*4+1] = v1;
1396         dpsoftrast.uniform4f[index*4+2] = v2;
1397         dpsoftrast.uniform4f[index*4+3] = v3;
1398 }
1399 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1400 {
1401         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1402         command->index = index;
1403         memcpy(command->val, v, sizeof(command->val));
1404
1405         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1406 }
1407
1408 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1409 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1410 {
1411         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1412 }
1413 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1414 {
1415 #ifdef SSE_POSSIBLE
1416         int i, index;
1417         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1418         {
1419                 __m128 m0, m1, m2, m3;
1420                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1421                 command->index = (DPSOFTRAST_UNIFORM)index;
1422                 if (((size_t)v)&(ALIGN_SIZE-1))
1423                 {
1424                         m0 = _mm_loadu_ps(v);
1425                         m1 = _mm_loadu_ps(v+4);
1426                         m2 = _mm_loadu_ps(v+8);
1427                         m3 = _mm_loadu_ps(v+12);
1428                 }
1429                 else
1430                 {
1431                         m0 = _mm_load_ps(v);
1432                         m1 = _mm_load_ps(v+4);
1433                         m2 = _mm_load_ps(v+8);
1434                         m3 = _mm_load_ps(v+12);
1435                 }
1436                 if (transpose)
1437                 {
1438                         __m128 t0, t1, t2, t3;
1439                         t0 = _mm_unpacklo_ps(m0, m1);
1440                         t1 = _mm_unpacklo_ps(m2, m3);
1441                         t2 = _mm_unpackhi_ps(m0, m1);
1442                         t3 = _mm_unpackhi_ps(m2, m3);
1443                         m0 = _mm_movelh_ps(t0, t1);
1444                         m1 = _mm_movehl_ps(t1, t0);
1445                         m2 = _mm_movelh_ps(t2, t3);
1446                         m3 = _mm_movehl_ps(t3, t2);                     
1447                 }
1448                 _mm_store_ps(command->val, m0);
1449                 _mm_store_ps(command->val+4, m1);
1450                 _mm_store_ps(command->val+8, m2);
1451                 _mm_store_ps(command->val+12, m3);
1452                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1453                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1454                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1455                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1456         }
1457 #endif
1458 }
1459
1460 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1461 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1462 {
1463         thread->uniform1i[command->index] = command->val;
1464 }
1465 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1466 {
1467         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1468         command->index = index;
1469         command->val = i0;
1470
1471         dpsoftrast.uniform1i[command->index] = i0;
1472 }
1473
1474 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1475 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1476 {
1477         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1478         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1479 }
1480 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1481 {
1482         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1483         command->clipplane[0] = x;
1484         command->clipplane[1] = y;
1485         command->clipplane[2] = z;
1486         command->clipplane[3] = w;
1487 }
1488
1489 #ifdef SSE_POSSIBLE
1490 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1491 {
1492         float *end = dst + size*4;
1493         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1494         {
1495                 while (dst < end)
1496                 {
1497                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1498                         dst += 4;
1499                         src += stride;
1500                 }
1501         }
1502         else
1503         {
1504                 while (dst < end)
1505                 {
1506                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1507                         dst += 4;
1508                         src += stride;
1509                 }
1510         }
1511 }
1512
1513 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1514 {
1515         float *end = dst + size*4;
1516         if (stride == sizeof(float[3]))
1517         {
1518                 float *end4 = dst + (size&~3)*4;        
1519                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1520                 {
1521                         while (dst < end4)
1522                         {
1523                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1524                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1525                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1526                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1527                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1528                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1529                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1530                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1531                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1532                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1533                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1534                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1535                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1536                                 dst += 16;
1537                                 src += 4*sizeof(float[3]);
1538                         }
1539                 }
1540                 else
1541                 {
1542                         while (dst < end4)
1543                         {
1544                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1545                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1546                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1547                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1548                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1549                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1550                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1551                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1552                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1553                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1554                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1555                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1556                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1557                                 dst += 16;
1558                                 src += 4*sizeof(float[3]);
1559                         }
1560                 }
1561         }
1562         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1563         {
1564                 while (dst < end)
1565                 {
1566                         __m128 v = _mm_loadu_ps((const float *)src);
1567                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1568                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1569                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1570                         _mm_store_ps(dst, v);
1571                         dst += 4;
1572                         src += stride;
1573                 }
1574         }
1575         else
1576         {
1577                 while (dst < end)
1578                 {
1579                         __m128 v = _mm_load_ps((const float *)src);
1580                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1581                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1582                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1583                         _mm_store_ps(dst, v);
1584                         dst += 4;
1585                         src += stride;
1586                 }
1587         }
1588 }
1589
1590 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1591 {
1592         float *end = dst + size*4;
1593         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1594         if (stride == sizeof(float[2]))
1595         {
1596                 float *end2 = dst + (size&~1)*4;
1597                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1598                 {
1599                         while (dst < end2)
1600                         {
1601                                 __m128 v = _mm_loadu_ps((const float *)src);
1602                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1603                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1604                                 dst += 8;
1605                                 src += 2*sizeof(float[2]);
1606                         }
1607                 }
1608                 else
1609                 {
1610                         while (dst < end2)
1611                         {
1612                                 __m128 v = _mm_load_ps((const float *)src);
1613                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1614                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1615                                 dst += 8;
1616                                 src += 2*sizeof(float[2]);
1617                         }
1618                 }
1619         }
1620         while (dst < end)
1621         {
1622                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1623                 dst += 4;
1624                 src += stride;
1625         }
1626 }
1627
1628 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1629 {
1630         float *end = dst + size*4;
1631         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1632         if (stride == sizeof(unsigned char[4]))
1633         {
1634                 float *end4 = dst + (size&~3)*4;
1635                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1636                 {
1637                         while (dst < end4)
1638                         {
1639                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1640                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1641                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1642                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1643                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1644                                 dst += 16;
1645                                 src += 4*sizeof(unsigned char[4]);
1646                         }
1647                 }
1648                 else
1649                 {
1650                         while (dst < end4)
1651                         {
1652                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1653                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1654                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1655                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1656                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1657                                 dst += 16;
1658                                 src += 4*sizeof(unsigned char[4]);
1659                         }
1660                 }
1661         }
1662         while (dst < end)
1663         {
1664                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1665                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1666                 dst += 4;
1667                 src += stride;
1668         }
1669 }
1670
// Writes `size` copies of the four floats at src into dst.
// src is read once with an unaligned load; dst is written with aligned stores.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0;i < size;i++)
		_mm_store_ps(dst + i*4, value);
}
1681 #endif
1682
// Multiplies `numitems` four-float vertices by the 4x4 matrix inmatrix16f:
// out = in.x*row0 + in.y*row1 + in.z*row2 + in.w*row3, where rowN are the
// four consecutive groups of four floats in the matrix.
// An identity matrix is detected by byte comparison and turned into a plain
// copy (skipped entirely when out4f and in4f are the same pointer).
// out4f is written with aligned stores; in4f may be unaligned.
// When SSE_POSSIBLE is not defined the function body is empty.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identity16f[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
	__m128 row0, row1, row2, row3;
	float *stop;
	if (memcmp(identity16f, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	stop = out4f + numitems*4;
	row0 = _mm_loadu_ps(inmatrix16f);
	row1 = _mm_loadu_ps(inmatrix16f + 4);
	row2 = _mm_loadu_ps(inmatrix16f + 8);
	row3 = _mm_loadu_ps(inmatrix16f + 12);
	if ((((size_t)in4f)&(ALIGN_SIZE-1)) == 0)
	{
		// aligned input
		for (;out4f < stop;out4f += 4, in4f += 4)
		{
			__m128 vin = _mm_load_ps(in4f);
			__m128 acc = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(2, 2, 2, 2)), row2),
									_mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(3, 3, 3, 3)), row3));
			acc = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(1, 1, 1, 1)), row1), acc);
			_mm_store_ps(out4f, _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(0, 0, 0, 0)), row0), acc));
		}
	}
	else
	{
		// unaligned input
		for (;out4f < stop;out4f += 4, in4f += 4)
		{
			__m128 vin = _mm_loadu_ps(in4f);
			__m128 acc = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(2, 2, 2, 2)), row2),
									_mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(3, 3, 3, 3)), row3));
			acc = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(1, 1, 1, 1)), row1), acc);
			_mm_store_ps(out4f, _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(0, 0, 0, 0)), row0), acc));
		}
	}
#endif
}
1730
// Copies `numitems` four-float vertices from in4f to out4f.
// The two buffers may be the same or may overlap (memmove semantics), and a
// count of zero or less copies nothing.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	// a negative count would otherwise convert to an enormous size_t
	if (numitems <= 0)
		return;
	// memmove rather than memcpy: callers may hand in aliasing buffers
	memmove(out4f, in4f, numitems * sizeof(float[4]));
}
1735
1736 #ifdef SSE_POSSIBLE
// Perspective-divides the vertex `in` = (x, y, z, w) and applies the viewport:
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// where cN / sN are lanes of viewportcenter / viewportscale. Lane 0 of both
// therefore belongs to the 1/w term and lanes 1-3 to x, y, z.
// `in` is captured before `out` is assigned, so both may name the same variable.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_mul_ps(viewportscale, p); \
	p = _mm_div_ps(p, w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1744
// Same expansion as DPSOFTRAST_PROJECTVERTEX: for `in` = (x, y, z, w) it yields
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// with cN / sN taken from the lanes of viewportcenter / viewportscale.
// `in` is captured before `out` is assigned, so both may name the same variable.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_mul_ps(viewportscale, p); \
	p = _mm_div_ps(p, w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1752
// Computes out = in.x*m0 + in.y*m1 + in.z*m2 + in.w*m3 for four __m128 rows.
// The sum is accumulated from the last term towards the first.
// `in` is captured before `out` is assigned, so both may name the same variable.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	__m128 tvsum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
							  _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)); \
	tvsum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), tvsum); \
	out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), tvsum); \
}
1761
1762 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1763 {
1764         int clipmask = 0xFF;
1765         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1766         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1767         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1768         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1769         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1770         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1771         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1772         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1773         #define BBFRONT(k, pos) \
1774         { \
1775                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1776                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1777                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1778                 { \
1779                         __m128 proj; \
1780                         clipmask &= ~(1<<k); \
1781                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1782                         minproj = _mm_min_ss(minproj, proj); \
1783                         maxproj = _mm_max_ss(maxproj, proj); \
1784                 } \
1785         }
1786         BBFRONT(0, minpos); 
1787         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1788         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1789         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1790         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1791         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1792         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1793         BBFRONT(7, maxpos);
1794         #define BBCLIP(k) \
1795         { \
1796                 if (clipmask&(1<<k)) \
1797                 { \
1798                         if (!(clipmask&(1<<(k^1)))) \
1799                         { \
1800                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1801                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1802                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1803                                 minproj = _mm_min_ss(minproj, proj); \
1804                                 maxproj = _mm_max_ss(maxproj, proj); \
1805                         } \
1806                         if (!(clipmask&(1<<(k^2)))) \
1807                         { \
1808                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1809                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1810                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1811                                 minproj = _mm_min_ss(minproj, proj); \
1812                                 maxproj = _mm_max_ss(maxproj, proj); \
1813                         } \
1814                         if (!(clipmask&(1<<(k^4)))) \
1815                         { \
1816                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1817                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1818                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1819                                 minproj = _mm_min_ss(minproj, proj); \
1820                                 maxproj = _mm_max_ss(maxproj, proj); \
1821                         } \
1822                 } \
1823         }
1824         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1825         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1826         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1827         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1828         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1829         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1830         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1831         *starty = _mm_cvttss_si32(maxproj);
1832         *endy = _mm_cvttss_si32(minproj)+1;
1833         return clipmask;
1834 }
1835         
1836 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1837 {
1838         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1839         float *end = out4f + numitems*4;
1840         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1841         __m128 minpos, maxpos;
1842         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1843         {
1844                 minpos = maxpos = _mm_loadu_ps(in4f);
1845                 while (out4f < end)
1846                 {
1847                         __m128 v = _mm_loadu_ps(in4f);
1848                         minpos = _mm_min_ps(minpos, v);
1849                         maxpos = _mm_max_ps(maxpos, v);
1850                         _mm_store_ps(out4f, v);
1851                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1852                         _mm_store_ps(screen4f, v);
1853                         in4f += 4;
1854                         out4f += 4;
1855                         screen4f += 4;
1856                 }
1857         }
1858         else
1859         {
1860                 minpos = maxpos = _mm_load_ps(in4f);
1861                 while (out4f < end)
1862                 {
1863                         __m128 v = _mm_load_ps(in4f);
1864                         minpos = _mm_min_ps(minpos, v);
1865                         maxpos = _mm_max_ps(maxpos, v);
1866                         _mm_store_ps(out4f, v);
1867                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1868                         _mm_store_ps(screen4f, v);
1869                         in4f += 4;
1870                         out4f += 4;
1871                         screen4f += 4;
1872                 }
1873         }
1874         if (starty && endy) 
1875         {
1876                 ALIGN(float minposf[4]);
1877                 ALIGN(float maxposf[4]);
1878                 _mm_store_ps(minposf, minpos);
1879                 _mm_store_ps(maxposf, maxpos);
1880                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1881         }
1882         return 0;
1883 }
1884
1885 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1886 {
1887         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1888         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1889         float *end;
1890         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1891                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1892         end = out4f + numitems*4;
1893         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1894         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1895         m0 = _mm_loadu_ps(inmatrix16f);
1896         m1 = _mm_loadu_ps(inmatrix16f + 4);
1897         m2 = _mm_loadu_ps(inmatrix16f + 8);
1898         m3 = _mm_loadu_ps(inmatrix16f + 12);
1899         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1900         {
1901                 minpos = maxpos = _mm_loadu_ps(in4f);
1902                 while (out4f < end)
1903                 {
1904                         __m128 v = _mm_loadu_ps(in4f);
1905                         minpos = _mm_min_ps(minpos, v);
1906                         maxpos = _mm_max_ps(maxpos, v);
1907                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1908                         _mm_store_ps(out4f, v);
1909                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1910                         _mm_store_ps(screen4f, v);
1911                         in4f += 4;
1912                         out4f += 4;
1913                         screen4f += 4;
1914                 }
1915         }
1916         else
1917         {
1918                 minpos = maxpos = _mm_load_ps(in4f);
1919                 while (out4f < end)
1920                 {
1921                         __m128 v = _mm_load_ps(in4f);
1922                         minpos = _mm_min_ps(minpos, v);
1923                         maxpos = _mm_max_ps(maxpos, v);
1924                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1925                         _mm_store_ps(out4f, v);
1926                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1927                         _mm_store_ps(screen4f, v);
1928                         in4f += 4;
1929                         out4f += 4;
1930                         screen4f += 4;
1931                 }
1932         }
1933         if (starty && endy) 
1934         {
1935                 ALIGN(float minposf[4]);
1936                 ALIGN(float maxposf[4]);
1937                 _mm_store_ps(minposf, minpos);
1938                 _mm_store_ps(maxposf, maxpos);
1939                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1940         }
1941         return 0;
1942 }
1943 #endif
1944
// Fills dpsoftrast.post_array4f[outarray] with four-float data for the current
// draw range (dpsoftrast.firstvertex, dpsoftrast.numvertices) and returns it.
//   DPSOFTRAST_ARRAY_POSITION: three-float positions, expanded to xyz1.
//   DPSOFTRAST_ARRAY_COLOR: float colours if pointer_color4f is set, else byte
//     colours if pointer_color4ub is set, else the constant dpsoftrast.color.
//   anything else: treated as texcoord unit inarray-DPSOFTRAST_ARRAY_TEXCOORD0
//     and loaded according to its component count (2, 3 or 4).
// If the texcoord pointer is NULL or the component count is not 2, 3 or 4,
// the output buffer is returned without being written.
// Without SSE_POSSIBLE the function returns NULL.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const unsigned char *inb;
	int firstvertex = dpsoftrast.firstvertex;
	int numvertices = dpsoftrast.numvertices;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
		{
			// no colour array bound: replicate the constant colour
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
		}
	}
	else
	{
		int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			int components = dpsoftrast.components_texcoord[unit];
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			if (components == 2)
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
			else if (components == 3)
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
			else if (components == 4)
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
	}
	return outf;
#else
	return NULL;
#endif
}
2003
2004 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
2005 {
2006         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
2007         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
2008         return data;
2009 }
2010
#if 0
// (disabled) Projects post array `outarray` in place into
// dpsoftrast.screencoord4f and stores the draw row range and clip result in
// dpsoftrast.drawstarty / drawendy / drawclipped. When inarray >= 0 the array
// is loaded from that input first. Without SSE_POSSIBLE it returns NULL.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
2023
// Transforms post array `outarray` in place by inmatrix16f, writes the
// projected vertices to dpsoftrast.screencoord4f and stores the draw row range
// and clip result in dpsoftrast.drawstarty / drawendy / drawclipped.
// When inarray >= 0 the array is loaded from that input first.
// Returns the array, or NULL without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2034
2035 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
             // fills zf[span->startx] .. zf[span->endx-1] with an approximation of 1/w for
             // every pixel of the span, where w is linear across the span:
             //   w(x) = triangle->w[2] + triangle->w[0]*(span->x + x) + triangle->w[1]*span->y
             // the exact reciprocal is only computed at subspan boundaries (every
             // DPSOFTRAST_DRAW_MAXSUBSPAN pixels and at the last pixel); values in between
             // are linearly interpolated.  thread is not used here.
2036 {
2037         int x;
2038         int startx = span->startx;
2039         int endx = span->endx;
             // change of w per pixel step in x
2040         float wslope = triangle->w[0];
             // w at span-relative x == 0
2041         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
             // exact 1/w at the first pixel; inside the loop this always holds the exact
             // value at the start of the next subspan
2042         float endz = 1.0f / (w + wslope * startx);
2043         if (triangle->w[0] == 0)
2044         {
2045                 // LordHavoc: fast flat polygons (HUD/menu)
                     // w does not change along x, so every pixel gets the same value
2046                 for (x = startx;x < endx;x++)
2047                         zf[x] = endz;
2048                 return;
2049         }
2050         for (x = startx;x < endx;)
2051         {
                     // nextsub is where the next exact 1/w is evaluated, endsub is the
                     // last pixel written by this iteration
2052                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2053                 float z = endz, dz;
                     // final subspan: evaluate exactly at the last pixel and fill up to it
2054                 if (nextsub >= endx) nextsub = endsub = endx-1;
2055                 endz = 1.0f / (w + wslope * nextsub);
                     // per-pixel step; 0 when the subspan is a single pixel (avoids 0/0)
2056                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2057                 for (; x <= endsub; x++, z += dz)
2058                         zf[x] = z;
2059         }
2060 }
2061
2062 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
             // final stage of a span: writes/blends the 4-byte pixels in in4ub into color
             // buffer 0 (dpsoftrast.fb_colorpixels[0]) on row span->y, starting at column
             // span->x.  in4ub is indexed by the same x as the span
             // (span->startx <= x < span->endx); byte 3 of each pixel is read as alpha.
             // runs of pixels whose span->pixelmask[x] entry is 0 are skipped.  the mask is
             // also modified here: alphatest (alpha < 128) and, for the alpha-weighted
             // blend modes, zero-alpha pixels clear mask entries, and two sentinel bytes
             // are written at pixelmask[endx] and pixelmask[endx+1] (endx after the
             // trimming below), so the mask must have room for them.
             // the blend equation is selected by thread->fb_blendmode; triangle is not used.
             // does nothing without SSE_POSSIBLE or when there is no color buffer 0.
2063 {
2064 #ifdef SSE_POSSIBLE
2065         int x;
2066         int startx = span->startx;
2067         int endx = span->endx;
2068         int maskx;
2069         int subx;
2070         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2071         unsigned char * RESTRICT pixelmask = span->pixelmask;
2072         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2073         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2074         if (!pixel)
2075                 return;
             // move both views of the color buffer to span-relative x == 0
2076         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2077         pixeli += span->y * dpsoftrast.fb_width + span->x;
2078         // handle alphatest now (this affects depth writes too)
2079         if (thread->alphatest)
2080                 for (x = startx;x < endx;x++)
2081                         if (in4ub[x*4+3] < 128)
2082                                 pixelmask[x] = false;
2083         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2084         // helps sprites, text and hud artwork
2085         switch(thread->fb_blendmode)
2086         {
2087         case DPSOFTRAST_BLENDMODE_ALPHA:
2088         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2089         case DPSOFTRAST_BLENDMODE_SUBALPHA:
                     // in these modes a zero-alpha source pixel leaves the destination
                     // unchanged, so: advance startx past leading zero-alpha pixels, clear
                     // the mask of interior zero-alpha runs, and remember in maskx where the
                     // last run of non-zero alpha ends so that endx can be trimmed to it.
                     // if no pixel has alpha, maskx stays at startx and the span is empty.
2090                 maskx = startx;
2091                 for (x = startx;x < endx;x++)
2092                 {
2093                         if (in4ub[x*4+3] >= 1)
2094                         {
2095                                 startx = x;
2096                                 for (;;)
2097                                 {
                                             // skip to the end of the current non-zero-alpha run
2098                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2099                                         maskx = x;
2100                                         if (x >= endx) break;
2101                                         // fixed: there used to be an extra ++x here; combined with the pre-increment
                                             // below it meant pixel maskx+1 was never examined, so a pixel with alpha
                                             // right after a single zero-alpha pixel and followed only by zero-alpha
                                             // pixels was cut off by the "endx = maskx" trim (e.g. alphas A,0,B,0,0 lost B).
                                             // pixel maskx itself (zero alpha) is still left unmasked, as before.
2102                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2103                                         if (x >= endx) break;
2104                                 }
2105                                 break;
2106                         }
2107                 }
2108                 endx = maskx;
2109                 break;
2110         case DPSOFTRAST_BLENDMODE_OPAQUE:
2111         case DPSOFTRAST_BLENDMODE_ADD:
2112         case DPSOFTRAST_BLENDMODE_INVMOD:
2113         case DPSOFTRAST_BLENDMODE_MUL:
2114         case DPSOFTRAST_BLENDMODE_MUL2:
2115         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2116         case DPSOFTRAST_BLENDMODE_INVADD:
2117                 break;
2118         }
2119         // put some special values at the end of the mask to ensure the loops end
             // (a 1 stops the "skip masked" scans, the following 0 stops the "run length" scans)
2120         pixelmask[endx] = 1;
2121         pixelmask[endx+1] = 0;
2122         // LordHavoc: use a double loop to identify subspans, this helps the
2123         // optimized copy/blend loops to perform at their best, most triangles
2124         // have only one run of pixels, and do the search using wide reads...
2125         x = startx;
2126         while (x < endx)
2127         {
2128                 // if this pixel is masked off, it's probably not alone...
2129                 if (!pixelmask[x])
2130                 {
2131                         x++;
2132 #if 1
2133                         if (x + 8 < endx)
2134                         {
2135                                 // the 4-item search must be aligned or else it stalls badly
2136                                 if ((x & 3) && !pixelmask[x])
2137                                 {
                                             // (this first test cannot fire: the enclosing condition
                                             // already required !pixelmask[x])
2138                                         if(pixelmask[x]) goto endmasked;
2139                                         x++;
2140                                         if (x & 3)
2141                                         {
2142                                                 if(pixelmask[x]) goto endmasked;
2143                                                 x++;
2144                                                 if (x & 3)
2145                                                 {
2146                                                         if(pixelmask[x]) goto endmasked;
2147                                                         x++;
2148                                                 }
2149                                         }
2150                                 }
                                     // skip four masked-off pixels per step
2151                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2152                                         x += 4;
2153                         }
2154 #endif
                             // byte-wise remainder; terminated by the sentinel 1 at pixelmask[endx]
2155                         for (;!pixelmask[x];x++)
2156                                 ;
2157                         // rather than continue the loop, just check the end variable
2158                         if (x >= endx)
2159                                 break;
2160                 }
2161         endmasked:
2162                 // find length of subspan
                     // (x is the first unmasked pixel, subx becomes one past the last one of this run;
                     // the wide compare assumes set mask bytes are exactly 1)
2163                 subx = x + 1;
2164 #if 1
2165                 if (subx + 8 < endx)
2166                 {
2167                         if (subx & 3)
2168                         {
2169                                 if(!pixelmask[subx]) goto endunmasked;
2170                                 subx++;
2171                                 if (subx & 3)
2172                                 {
2173                                         if(!pixelmask[subx]) goto endunmasked;
2174                                         subx++;
2175                                         if (subx & 3)
2176                                         {
2177                                                 if(!pixelmask[subx]) goto endunmasked;
2178                                                 subx++;
2179                                         }
2180                                 }
2181                         }
2182                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2183                                 subx += 4;
2184                 }
2185 #endif
                     // byte-wise remainder; terminated by the sentinel 0 at pixelmask[endx+1]
2186                 for (;pixelmask[subx];subx++)
2187                         ;
2188                 // the checks can overshoot, so make sure to clip it...
2189                 if (subx > endx)
2190                         subx = endx;
2191         endunmasked:
2192                 // now that we know the subspan length...  process!
                     // (every case below advances x up to subx)
2193                 switch(thread->fb_blendmode)
2194                 {
2195                 case DPSOFTRAST_BLENDMODE_OPAQUE:
                             // plain copy: 16 pixels, then 4, then 2, then 1 at a time
2196 #if 0
2197                         if (subx - x >= 16)
2198                         {
2199                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2200                                 x = subx;
2201                         }
2202                         else
2203 #elif 1
2204                         while (x + 16 <= subx)
2205                         {
2206                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2207                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2208                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2209                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2210                                 x += 16;
2211                         }
2212 #endif
2213                         {
2214                                 while (x + 4 <= subx)
2215                                 {
2216                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2217                                         x += 4;
2218                                 }
2219                                 if (x + 2 <= subx)
2220                                 {
2221                                         pixeli[x] = ini[x];
2222                                         pixeli[x+1] = ini[x+1];
2223                                         x += 2;
2224                                 }
2225                                 if (x < subx)
2226                                 {
2227                                         pixeli[x] = ini[x];
2228                                         x++;
2229                                 }
2230                         }
2231                         break;
2232                 case DPSOFTRAST_BLENDMODE_ALPHA:
                     // FINISHBLEND(blend2, blend1) processes pixels x .. subx-1: two pixels per
                     // iteration with blend2, then one leftover pixel with blend1.  inside the
                     // blend code "src" (incoming) and "dst" (framebuffer) hold the pixels
                     // widened to 16 bits per channel; the code must leave the result in "dst",
                     // which is packed back to bytes with unsigned saturation.
                     // the macro stays defined for the remaining cases of this switch.
2233                 #define FINISHBLEND(blend2, blend1) \
2234                         for (;x + 1 < subx;x += 2) \
2235                         { \
2236                                 __m128i src, dst; \
2237                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2238                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2239                                 blend2; \
2240                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2241                         } \
2242                         if (x < subx) \
2243                         { \
2244                                 __m128i src, dst; \
2245                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2246                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2247                                 blend1; \
2248                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2249                                 x++; \
2250                         }
                             // dst += (src - dst) * srcalpha / 256
                             // ("blend" is the source alpha broadcast to all channels of its pixel)
2251                         FINISHBLEND({
2252                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2253                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2254                         }, {
2255                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2256                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2257                         });
2258                         break;
2259                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
                             // dst += src * srcalpha / 256
2260                         FINISHBLEND({
2261                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2262                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2263                         }, {
2264                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2265                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2266                         });
2267                         break;
2268                 case DPSOFTRAST_BLENDMODE_ADD:
                             // dst += src
2269                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2270                         break;
2271                 case DPSOFTRAST_BLENDMODE_INVMOD:
                             // dst -= dst * src / 256
2272                         FINISHBLEND({
2273                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2274                         }, {
2275                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2276                         });
2277                         break;
2278                 case DPSOFTRAST_BLENDMODE_MUL:
                             // dst = src * dst / 256
2279                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2280                         break;
2281                 case DPSOFTRAST_BLENDMODE_MUL2:
                             // dst = src * dst / 128
2282                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2283                         break;
2284                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
                             // dst -= src * srcalpha / 256
2285                         FINISHBLEND({
2286                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2287                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2288                         }, {
2289                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2290                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2291                         });
2292                         break;
2293                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
                             // dst = src + dst - dst * srcalpha / 256
2294                         FINISHBLEND({
2295                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2296                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2297                         }, {
2298                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2299                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2300                         });
2301                         break;
2302                 case DPSOFTRAST_BLENDMODE_INVADD:
                             // dst += (255 - dst) * src / 256
2303                         FINISHBLEND({
2304                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2305                         }, {
2306                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2307                         });
2308                         break;
2309                 }
2310         }
2311 #endif
2312 }
2313
2314 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2315         // warning: this is SLOW, only use if the optimized per-span functions won't do
             // samples mip level "mip" of texture at (x, y) and stores the four texel bytes in
             // c[0..3], in the same byte order as the texture data.
             // x and y are scaled by the mip's width and height (texture->mipmap[mip][2] and
             // [3]); the texel data starts texture->mipmap[mip][0] bytes into texture->bytes.
             // if texture->filter has DPSOFTRAST_TEXTURE_FILTER_LINEAR set, the four
             // surrounding texels are blended with 12-bit fractional weights, otherwise the
             // single containing texel is returned.
             // with DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE the texel indices are clamped to
             // [0, size-1]; otherwise they are wrapped with "& (size-1)", which is only a
             // true wrap when the size is a power of two.
2316 {
2317         const unsigned char * RESTRICT pixelbase;
2318         const unsigned char * RESTRICT pixel[4];
2319         int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2320         int wrapmask[2] = { width-1, height-1 };
2321         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2322         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2323         {
                     // texel coordinates in signed fixed point with 12 fractional bits, shifted
                     // by half a texel (2048) so that weights are relative to texel centers.
                     // fixed: this used to be converted to unsigned int, which is undefined for
                     // negative values and made the ">= 0" clamps below unreachable, so with
                     // CLAMPTOEDGE a coordinate just below the first texel center sampled the
                     // opposite edge.  floor() keeps the fraction consistent for negative values.
2324                 int tc[2] = { (int)floor(x * (width<<12) - 2048), (int)floor(y * (height<<12) - 2048) };
                     // low 12 bits are the blend fraction (non-negative even for negative tc)
2325                 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2326                 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
                     // bilinear weights; the four of them always sum to 0x1000 * 0x1000 == 1<<24
2327                 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
                     // integer texel index; NOTE: relies on >> of a negative int being an
                     // arithmetic shift (implementation-defined in C, true for GCC/Clang/MSVC)
2328                 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2329                 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2330                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2331                 {
2332                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2333                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2334                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2335                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2336                 }
2337                 else
2338                 {
2339                         tci[0] &= wrapmask[0];
2340                         tci[1] &= wrapmask[1];
2341                         tci1[0] &= wrapmask[0];
2342                         tci1[1] &= wrapmask[1];
2343                 }
                     // the four texels: [0] = (x0,y0), [1] = (x1,y0), [2] = (x0,y1), [3] = (x1,y1)
2344                 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2345                 pixel[1] = pixelbase + 4 * (tci[1]*width+tci1[0]);
2346                 pixel[2] = pixelbase + 4 * (tci1[1]*width+tci[0]);
2347                 pixel[3] = pixelbase + 4 * (tci1[1]*width+tci1[0]);
                     // weighted sum per channel; >>24 removes the 1<<24 weight scale
2348                 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2349                 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2350                 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2351                 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2352         }
2353         else
2354         {
                     // nearest texel.  fixed: use floor() instead of the implicit truncation
                     // toward zero, which mapped coordinates in (-1, 0) to texel 0 instead of
                     // texel -1 (i.e. size-1 after wrapping).
2355                 int tci[2] = { (int)floor(x * width), (int)floor(y * height) };
2356                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2357                 {
2358                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2359                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2360                 }
2361                 else
2362                 {
2363                         tci[0] &= wrapmask[0];
2364                         tci[1] &= wrapmask[1];
2365                 }
2366                 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2367                 c[0] = pixel[0][0];
2368                 c[1] = pixel[0][1];
2369                 c[2] = pixel[0][2];
2370                 c[3] = pixel[0][3];
2371         }
2372 }
2373
2374 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
             // samples the texture bound to unit texunitindex (thread->texbound[]) for every
             // pixel of the span and writes four floats per pixel to out4f[x*4+0..3] for
             // span->startx <= x < span->endx.  texel byte 2 goes to component 0 and byte 0
             // to component 2 (presumably BGRA texel data to RGBA output); byte values are
             // scaled to the 0..1 range.
             // the texture coordinates are the first two components of interpolated attribute
             // arrayindex (data + slope*x, as set up by DPSOFTRAST_CALCATTRIB4F) multiplied
             // by zf[x]; they are evaluated exactly at subspan boundaries (every
             // DPSOFTRAST_DRAW_MAXSUBSPAN pixels and at the last pixel) and stepped linearly
             // in between, in fixed point with 12 fractional bits.
             // the mip level comes from triangle->mip[texunitindex]; texture->filter selects
             // bilinear or nearest sampling and DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE selects
             // clamping instead of "& (size-1)" wrapping (a true wrap only for power-of-two
             // sizes).  with no texture bound the span is filled with 1.0.
2375 {
2376         int x;
2377         int startx = span->startx;
2378         int endx = span->endx;
2379         int flags;
2380         float c[4];
2381         float data[4];
2382         float slope[4];
2383         float tc[2], endtc[2];
2384         float tcscale[2];
             // fixed: the texel indices and their clamp limits are signed now.  as unsigned
             // values the "tci >= tcimin" tests were always true, so negative coordinates
             // were clamped to the maximum instead of the minimum in the CLAMPTOEDGE paths.
2385         int tci[2];
2386         int tci1[2];
2387         int tcimin[2];
2388         int tcimax[2];
2389         int tciwrapmask[2];
2390         int tciwidth;
2391         int filter;
2392         int mip;
2393         const unsigned char * RESTRICT pixelbase;
2394         const unsigned char * RESTRICT pixel[4];
2395         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2396         // if no texture is bound, just fill it with white
2397         if (!texture)
2398         {
2399                 for (x = startx;x < endx;x++)
2400                 {
2401                         out4f[x*4+0] = 1.0f;
2402                         out4f[x*4+1] = 1.0f;
2403                         out4f[x*4+2] = 1.0f;
2404                         out4f[x*4+3] = 1.0f;
2405                 }
2406                 return;
2407         }
2408         mip = triangle->mip[texunitindex];
2409         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2410         // if this mipmap of the texture is 1 pixel, just fill it with that color
2411         if (texture->mipmap[mip][1] == 4)
2412         {
                     // fixed: read the texel of the selected mip level (pixelbase) rather than
                     // the first four bytes of texture->bytes, which ignored the mip's offset
                     // texture->mipmap[mip][0]
2413                 c[0] = pixelbase[2] * (1.0f/255.0f);
2414                 c[1] = pixelbase[1] * (1.0f/255.0f);
2415                 c[2] = pixelbase[0] * (1.0f/255.0f);
2416                 c[3] = pixelbase[3] * (1.0f/255.0f);
2417                 for (x = startx;x < endx;x++)
2418                 {
2419                         out4f[x*4+0] = c[0];
2420                         out4f[x*4+1] = c[1];
2421                         out4f[x*4+2] = c[2];
2422                         out4f[x*4+3] = c[3];
2423                 }
2424                 return;
2425         }
2426         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2427         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2428         flags = texture->flags;
             // mipmap[mip][2] / [3] are the width / height of this mip level in texels
2429         tcscale[0] = texture->mipmap[mip][2];
2430         tcscale[1] = texture->mipmap[mip][3];
2431         tciwidth = texture->mipmap[mip][2];
2432         tcimin[0] = 0;
2433         tcimin[1] = 0;
2434         tcimax[0] = texture->mipmap[mip][2]-1;
2435         tcimax[1] = texture->mipmap[mip][3]-1;
2436         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2437         tciwrapmask[1] = texture->mipmap[mip][3]-1;
             // exact texel-space coordinate at the first pixel; inside the loop endtc always
             // holds the exact coordinate at the start of the next subspan
2438         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2439         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2440         if (filter)
2441         {
                     // bilinear weights are relative to texel centers
2442                 endtc[0] -= 0.5f;
2443                 endtc[1] -= 0.5f;
2444         }
2445         for (x = startx;x < endx;)
2446         {
                     // fixed-point (12 fractional bits) coordinate and per-pixel step; signed so
                     // that negative coordinates and negative steps are represented directly
                     // (converting a negative float to unsigned int is undefined)
2447                 int subtc[2];
2448                 int substep[2];
2449                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2450                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2451                 if (nextsub >= endx)
2452                 {
                             // final subspan: evaluate exactly at the last pixel
2453                         nextsub = endsub = endx-1;
2454                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2455                 }
2456                 tc[0] = endtc[0];
2457                 tc[1] = endtc[1];
2458                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2459                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2460                 if (filter)
2461                 {
2462                         endtc[0] -= 0.5f;
2463                         endtc[1] -= 0.5f;
2464                 }
2465                 substep[0] = (endtc[0] - tc[0]) * subscale;
2466                 substep[1] = (endtc[1] - tc[1]) * subscale;
                     // floor() keeps the integer/fraction split consistent for negative values
2467                 subtc[0] = (int)floor(tc[0] * (1<<12));
2468                 subtc[1] = (int)floor(tc[1] * (1<<12));
                     // NOTE: the loops below rely on >> of a negative int being an arithmetic
                     // shift (implementation-defined in C, true for GCC/Clang/MSVC)
2469                 if (filter)
2470                 {
2471                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2472                         {
                                     // bilinear, clamp to edge
2473                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2474                                 {
2475                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2476                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
                                             // the four weights always sum to 1<<24
2477                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2478                                         tci[0] = subtc[0]>>12;
2479                                         tci[1] = subtc[1]>>12;
2480                                         tci1[0] = tci[0] + 1;
2481                                         tci1[1] = tci[1] + 1;
2482                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2483                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2484                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2485                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2486                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2487                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2488                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2489                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
                                             // 0xFF000000 == 255 * (1<<24): removes the weight scale and maps 0..255 to 0..1
2490                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2491                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2492                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2493                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2494                                         out4f[x*4+0] = c[0];
2495                                         out4f[x*4+1] = c[1];
2496                                         out4f[x*4+2] = c[2];
2497                                         out4f[x*4+3] = c[3];
2498                                 }
2499                         }
2500                         else
2501                         {
                                     // bilinear, wrapping
2502                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2503                                 {
2504                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2505                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2506                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2507                                         tci[0] = subtc[0]>>12;
2508                                         tci[1] = subtc[1]>>12;
2509                                         tci1[0] = tci[0] + 1;
2510                                         tci1[1] = tci[1] + 1;
2511                                         tci[0] &= tciwrapmask[0];
2512                                         tci[1] &= tciwrapmask[1];
2513                                         tci1[0] &= tciwrapmask[0];
2514                                         tci1[1] &= tciwrapmask[1];
2515                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2516                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2517                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2518                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2519                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2520                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2521                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2522                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2523                                         out4f[x*4+0] = c[0];
2524                                         out4f[x*4+1] = c[1];
2525                                         out4f[x*4+2] = c[2];
2526                                         out4f[x*4+3] = c[3];
2527                                 }
2528                         }
2529                 }
2530                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2531                 {
                             // nearest, clamp to edge
2532                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2533                         {
2534                                 tci[0] = subtc[0]>>12;
2535                                 tci[1] = subtc[1]>>12;
2536                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2537                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2538                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2539                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2540                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2541                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2542                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2543                                 out4f[x*4+0] = c[0];
2544                                 out4f[x*4+1] = c[1];
2545                                 out4f[x*4+2] = c[2];
2546                                 out4f[x*4+3] = c[3];
2547                         }
2548                 }
2549                 else
2550                 {
                             // nearest, wrapping
2551                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2552                         {
2553                                 tci[0] = subtc[0]>>12;
2554                                 tci[1] = subtc[1]>>12;
2555                                 tci[0] &= tciwrapmask[0];
2556                                 tci[1] &= tciwrapmask[1];
2557                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2558                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2559                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2560                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2561                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2562                                 out4f[x*4+0] = c[0];
2563                                 out4f[x*4+1] = c[1];
2564                                 out4f[x*4+2] = c[2];
2565                                 out4f[x*4+3] = c[3];
2566                         }
2567                 }
2568         }
2569 }
2570
2571 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2572 {
2573 #ifdef SSE_POSSIBLE
2574         int x;
2575         int startx = span->startx;
2576         int endx = span->endx;
2577         int flags;
2578         __m128 data, slope, tcscale;
2579         __m128i tcsize, tcmask, tcoffset, tcmax;
2580         __m128 tc, endtc;
2581         __m128i subtc, substep, endsubtc;
2582         int filter;
2583         int mip;
2584         int affine; // LordHavoc: optimized affine texturing case
2585         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2586         const unsigned char * RESTRICT pixelbase;
2587         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2588         // if no texture is bound, just fill it with white
2589         if (!texture)
2590         {
2591                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2592                 return;
2593         }
2594         mip = triangle->mip[texunitindex];
2595         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2596         // if this mipmap of the texture is 1 pixel, just fill it with that color
2597         if (texture->mipmap[mip][1] == 4)
2598         {
2599                 unsigned int k = *((const unsigned int *)pixelbase);
2600                 for (x = startx;x < endx;x++)
2601                         outi[x] = k;
2602                 return;
2603         }
2604         affine = zf[startx] == zf[endx-1];
2605         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2606         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2607         flags = texture->flags;
2608         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2609         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2610         tcscale = _mm_cvtepi32_ps(tcsize);
2611         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2612         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2613         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2614         if (filter)
2615                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2616         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2617         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2618         tcmax = _mm_packs_epi32(tcmask, tcmask);
2619         for (x = startx;x < endx;)
2620         {
2621                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2622                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2623                 if (nextsub >= endx || affine)
2624                 {
2625                         nextsub = endsub = endx-1;
2626                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2627                 }       
2628                 tc = endtc;
2629                 subtc = endsubtc;
2630                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2631                 if (filter)
2632                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2633                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2634                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2635                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2636                 substep = _mm_slli_epi32(substep, 1);
2637                 if (filter)
2638                 {
2639                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2640                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2641                         {
2642                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2643                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2644                                 {
2645                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2646                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2647                                         tci = _mm_madd_epi16(tci, tcoffset);
2648                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2649                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2650                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2651                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2652                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2653                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2654                                         fracm = _mm_srli_epi16(subtc, 1);
2655                                         pix1 = _mm_add_epi16(pix1,
2656                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2657                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2658                                         pix3 = _mm_add_epi16(pix3,
2659                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2660                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2661                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2662                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2663                                         pix2 = _mm_add_epi16(pix2,
2664                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2665                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2666                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2667                                 }
2668                                 if (x <= endsub)
2669                                 {
2670                                         const unsigned char * RESTRICT ptr1;
2671                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2672                                         tci = _mm_madd_epi16(tci, tcoffset);
2673                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2674                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2675                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2676                                         fracm = _mm_srli_epi16(subtc, 1);
2677                                         pix1 = _mm_add_epi16(pix1,
2678                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2679                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2680                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2681                                         pix1 = _mm_add_epi16(pix1,
2682                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2683                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2684                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2685                                         x++;
2686                                 }
2687                         }
2688                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2689                         {
2690                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2691                                 {
2692                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2693                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2694                                         tci = _mm_madd_epi16(tci, tcoffset);
2695                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2696                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2697                                                                                         _mm_setzero_si128());
2698                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2699                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2700                                                                                         _mm_setzero_si128());
2701                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2702                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2703                                         tci = _mm_madd_epi16(tci, tcoffset);
2704                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2705                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2706                                                                                         _mm_setzero_si128());
2707                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2708                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2709                                                                                         _mm_setzero_si128());
2710                                         fracm = _mm_srli_epi16(subtc, 1);
2711                                         pix1 = _mm_add_epi16(pix1,
2712                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2713                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2714                                         pix3 = _mm_add_epi16(pix3,
2715                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2716                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2717                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2718                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2719                                         pix2 = _mm_add_epi16(pix2,
2720                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2721                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2722                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2723                                 }
2724                                 if (x <= endsub)
2725                                 {
2726                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2727                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2728                                         tci = _mm_madd_epi16(tci, tcoffset);
2729                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2730                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2731                                                                                         _mm_setzero_si128());
2732                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2733                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2734                                                                                         _mm_setzero_si128());
2735                                         fracm = _mm_srli_epi16(subtc, 1);
2736                                         pix1 = _mm_add_epi16(pix1,
2737                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2738                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2739                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2740                                         pix1 = _mm_add_epi16(pix1,
2741                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2742                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2743                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2744                                         x++;
2745                                 }
2746                         }
2747                         else
2748                         {
2749                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2750                                 {
2751                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2752                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2753                                         tci = _mm_madd_epi16(tci, tcoffset);
2754                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2755                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2756                                                                                         _mm_setzero_si128());
2757                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2758                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2759                                                                                         _mm_setzero_si128());
2760                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2761                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2762                                         tci = _mm_madd_epi16(tci, tcoffset);
2763                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2764                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2765                                                                                         _mm_setzero_si128());
2766                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2767                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2768                                                                                         _mm_setzero_si128());
2769                                         fracm = _mm_srli_epi16(subtc, 1);
2770                                         pix1 = _mm_add_epi16(pix1,
2771                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2772                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2773                                         pix3 = _mm_add_epi16(pix3,
2774                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2775                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2776                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2777                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2778                                         pix2 = _mm_add_epi16(pix2,
2779                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2780                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2781                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2782                                 }
2783                                 if (x <= endsub)
2784                                 {
2785                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2786                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2787                                         tci = _mm_madd_epi16(tci, tcoffset);
2788                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2789                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2790                                                                                         _mm_setzero_si128());
2791                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2792                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2793                                                                                         _mm_setzero_si128());
2794                                         fracm = _mm_srli_epi16(subtc, 1);
2795                                         pix1 = _mm_add_epi16(pix1,
2796                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2797                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2798                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2799                                         pix1 = _mm_add_epi16(pix1,
2800                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2801                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2802                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2803                                         x++;
2804                                 }
2805                         }
2806                 }
2807                 else
2808                 {
2809                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2810                         {
2811                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2812                                 {
2813                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2814                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2815                                         tci = _mm_madd_epi16(tci, tcoffset);
2816                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2817                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2818                                 }
2819                                 if (x <= endsub)
2820                                 {
2821                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2822                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2823                                         tci = _mm_madd_epi16(tci, tcoffset);
2824                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2825                                         x++;
2826                                 }
2827                         }
2828                         else
2829                         {
2830                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2831                                 {
2832                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2833                                         tci = _mm_and_si128(tci, tcmax); 
2834                                         tci = _mm_madd_epi16(tci, tcoffset);
2835                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2836                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2837                                 }
2838                                 if (x <= endsub)
2839                                 {
2840                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2841                                         tci = _mm_and_si128(tci, tcmax); 
2842                                         tci = _mm_madd_epi16(tci, tcoffset);
2843                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2844                                         x++;
2845                                 }
2846                         }
2847                 }
2848         }
2849 #endif
2850 }
2851
2852 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2853 {
2854         // TODO: IMPLEMENT
2855         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2856 }
2857
// Shadowmap lookup placeholder: the input vector is ignored and the result
// is always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	return result;
}
2863
2864 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2865 {
2866         int x;
2867         int startx = span->startx;
2868         int endx = span->endx;
2869         float c[4];
2870         float data[4];
2871         float slope[4];
2872         float z;
2873         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2874         for (x = startx;x < endx;x++)
2875         {
2876                 z = zf[x];
2877                 c[0] = (data[0] + slope[0]*x) * z;
2878                 c[1] = (data[1] + slope[1]*x) * z;
2879                 c[2] = (data[2] + slope[2]*x) * z;
2880                 c[3] = (data[3] + slope[3]*x) * z;
2881                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2882                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2883                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2884                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2885         }
2886 }
2887
2888 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2889 {
2890         int x;
2891         int startx = span->startx;
2892         int endx = span->endx;
2893         float c[4];
2894         float data[4];
2895         float slope[4];
2896         float z;
2897         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2898         for (x = startx;x < endx;x++)
2899         {
2900                 z = zf[x];
2901                 c[0] = (data[0] + slope[0]*x) * z;
2902                 c[1] = (data[1] + slope[1]*x) * z;
2903                 c[2] = (data[2] + slope[2]*x) * z;
2904                 c[3] = (data[3] + slope[3]*x) * z;
2905                 out4f[x*4+0] = c[0];
2906                 out4f[x*4+1] = c[1];
2907                 out4f[x*4+2] = c[2];
2908                 out4f[x*4+3] = c[3];
2909         }
2910 }
2911
2912 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2913 {
2914         int x, startx = span->startx, endx = span->endx;
2915         float c[4], localcolor[4];
2916         localcolor[0] = subcolor[0];
2917         localcolor[1] = subcolor[1];
2918         localcolor[2] = subcolor[2];
2919         localcolor[3] = subcolor[3];
2920         for (x = startx;x < endx;x++)
2921         {
2922                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2923                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2924                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2925                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2926                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2927                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2928                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2929                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2930         }
2931 }
2932
2933 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2934 {
2935         int x, startx = span->startx, endx = span->endx;
2936         for (x = startx;x < endx;x++)
2937         {
2938                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2939                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2940                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2941                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2942         }
2943 }
2944
2945 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2946 {
2947         int x, startx = span->startx, endx = span->endx;
2948         for (x = startx;x < endx;x++)
2949         {
2950                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2951                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2952                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2953                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2954         }
2955 }
2956
2957 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2958 {
2959         int x, startx = span->startx, endx = span->endx;
2960         float a, b;
2961         for (x = startx;x < endx;x++)
2962         {
2963                 a = 1.0f - inb4f[x*4+3];
2964                 b = inb4f[x*4+3];
2965                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2966                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2967                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2968                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2969         }
2970 }
2971
2972 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2973 {
2974         int x, startx = span->startx, endx = span->endx;
2975         float localcolor[4], ilerp, lerp;
2976         localcolor[0] = color[0];
2977         localcolor[1] = color[1];
2978         localcolor[2] = color[2];
2979         localcolor[3] = color[3];
2980         ilerp = 1.0f - localcolor[3];
2981         lerp = localcolor[3];
2982         for (x = startx;x < endx;x++)
2983         {
2984                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2985                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2986                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2987                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2988         }
2989 }
2990
2991
2992
// Multiplies a span of 8-bit four-channel pixels (in4ub) by an interpolated
// per-vertex attribute and writes the result to out4ub.
//   arrayindex - attribute passed to DPSOFTRAST_CALCATTRIB to obtain the
//                base value (data) and per-pixel slope
//   zf         - per-pixel float factor the interpolated attribute is
//                multiplied by (indexed by x, like the pixel buffers)
// The attribute is evaluated exactly only at subspan boundaries (every
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels, and at endx-1) and stepped linearly in
// 16-bit fixed point (scale 256) in between.
// The whole body is compiled out when SSE_POSSIBLE is not defined.
2993 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2994 {
2995 #ifdef SSE_POSSIBLE
2996         int x;
2997         int startx = span->startx;
2998         int endx = span->endx;
2999         __m128 data, slope;
3000         __m128 mod, endmod;
3001         __m128i submod, substep, endsubmod;
3002         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
             // swap lanes 0 and 2 (presumably RGBA -> BGRA to match the pixel byte order)
3003         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3004         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
             // modulator at the first pixel, as float and as fixed point (x256)
3005         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3006         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3007         for (x = startx; x < endx;)
3008         {
3009                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3010                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3011                 if (nextsub >= endx)
3012                 {
                             // last (possibly short) subspan: end exactly on the final pixel
3013                         nextsub = endsub = endx-1;
3014                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3015                 }
                     // the previous subspan's end value becomes this subspan's start value
3016                 mod = endmod;
3017                 submod = endsubmod;
3018                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3019                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3020                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
                     // low four 16-bit lanes: modulator for pixel x, high four lanes: pixel x+1
3021                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3022                 substep = _mm_packs_epi32(substep, substep);
                     // NOTE(review): substep is the per-pixel delta but it is added only once
                     // per two-pixel iteration - confirm whether a doubled step was intended
3023                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3024                 {
                             // pixel bytes go into the high byte of each 16-bit lane, so
                             // _mm_mulhi_epu16 yields (byte * modulator) / 256
3025                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3026                         pix = _mm_mulhi_epu16(pix, submod);
3027                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3028                 }
                     // odd pixel left over at the end of the subspan
3029                 if (x <= endsub)
3030                 {
3031                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3032                         pix = _mm_mulhi_epu16(pix, submod);
3033                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3034                         x++;
3035                 }
3036         }
3037 #endif
3038 }
3039
// Fills a span of 8-bit four-channel pixels (out4ub) with an interpolated
// per-vertex attribute.
//   arrayindex - attribute passed to DPSOFTRAST_CALCATTRIB to obtain the
//                base value (data) and per-pixel slope
//   zf         - per-pixel float factor the interpolated attribute is
//                multiplied by (indexed by x, like the pixel buffer)
// The attribute is evaluated exactly only at subspan boundaries (every
// DPSOFTRAST_DRAW_MAXSUBSPAN pixels, and at endx-1) and stepped linearly in
// 16-bit fixed point (scale 4095) in between; the value is shifted right by
// 4 and packed with unsigned saturation, so an attribute of 1.0 gives 255.
// The whole body is compiled out when SSE_POSSIBLE is not defined.
3040 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3041 {
3042 #ifdef SSE_POSSIBLE
3043         int x;
3044         int startx = span->startx;
3045         int endx = span->endx;
3046         __m128 data, slope;
3047         __m128 mod, endmod;
3048         __m128i submod, substep, endsubmod;
3049         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
             // swap lanes 0 and 2 (presumably RGBA -> BGRA to match the pixel byte order)
3050         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3051         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
             // value at the first pixel, as float and as fixed point (x4095)
3052         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3053         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3054         for (x = startx; x < endx;)
3055         {
3056                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3057                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3058                 if (nextsub >= endx)
3059                 {
                             // last (possibly short) subspan: end exactly on the final pixel
3060                         nextsub = endsub = endx-1;
3061                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3062                 }
                     // the previous subspan's end value becomes this subspan's start value
3063                 mod = endmod;
3064                 submod = endsubmod;
3065                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3066                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3067                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
                     // low four 16-bit lanes: value for pixel x, high four lanes: pixel x+1
3068                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3069                 substep = _mm_packs_epi32(substep, substep);
                     // NOTE(review): substep is the per-pixel delta but it is added only once
                     // per two-pixel iteration - confirm whether a doubled step was intended
3070                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3071                 {
3072                         __m128i pix = _mm_srai_epi16(submod, 4);
3073                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3074                 }
                     // odd pixel left over at the end of the subspan
3075                 if (x <= endsub)
3076                 {
3077                         __m128i pix = _mm_srai_epi16(submod, 4);
3078                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3079                         x++;
3080                 }
3081         }
3082 #endif
3083 }
3084
3085 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3086 {
3087 #ifdef SSE_POSSIBLE
3088         int x, startx = span->startx, endx = span->endx;
3089         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3090         localcolor = _mm_packs_epi32(localcolor, localcolor);
3091         for (x = startx;x+2 <= endx;x+=2)
3092         {
3093                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3094                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3095                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3096                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3097         }
3098         if (x < endx)
3099         {
3100                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3101                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3102                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3103                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3104         }
3105 #endif
3106 }
3107
3108 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3109 {
3110 #ifdef SSE_POSSIBLE
3111         int x, startx = span->startx, endx = span->endx;
3112         for (x = startx;x+2 <= endx;x+=2)
3113         {
3114                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3115                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3116                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3117                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3118         }
3119         if (x < endx)
3120         {
3121                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3122                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3123                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3124                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3125         }
3126 #endif
3127 }
3128
3129 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3130 {
3131 #ifdef SSE_POSSIBLE
3132         int x, startx = span->startx, endx = span->endx;
3133         for (x = startx;x+2 <= endx;x+=2)
3134         {
3135                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3136                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3137                 pix1 = _mm_add_epi16(pix1, pix2);
3138                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3139         }
3140         if (x < endx)
3141         {
3142                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3143                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3144                 pix1 = _mm_add_epi16(pix1, pix2);
3145                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3146         }
3147 #endif
3148 }
3149
3150 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3151 {
3152 #ifdef SSE_POSSIBLE
3153         int x, startx = span->startx, endx = span->endx;
3154         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3155         tint = _mm_packs_epi32(tint, tint);
3156         for (x = startx;x+2 <= endx;x+=2)
3157         {
3158                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3159                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3160                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3161                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3162         }
3163         if (x < endx)
3164         {
3165                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3166                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3167                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3168                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3169         }
3170 #endif
3171 }
3172
3173 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3174 {
3175 #ifdef SSE_POSSIBLE
3176         int x, startx = span->startx, endx = span->endx;
3177         for (x = startx;x+2 <= endx;x+=2)
3178         {
3179                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3180                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3181                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3182                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3183                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3184         }
3185         if (x < endx)
3186         {
3187                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3188                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3189                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3190                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3191                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3192         }
3193 #endif
3194 }
3195
3196 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3197 {
3198 #ifdef SSE_POSSIBLE
3199         int x, startx = span->startx, endx = span->endx;
3200         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3201         localcolor = _mm_packs_epi32(localcolor, localcolor);
3202         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3203         for (x = startx;x+2 <= endx;x+=2)
3204         {
3205                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3206                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3207                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3208         }
3209         if (x < endx)
3210         {
3211                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3212                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3213                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3214         }
3215 #endif
3216 }
3217
3218
3219
3220 void DPSOFTRAST_VertexShader_Generic(void)
3221 {
3222         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3223         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3224         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3225         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3226                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3227 }
3228
3229 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3230 {
3231         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3232         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3233         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3234         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3235         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3236         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3237         {
3238                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3239                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3240                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3241                 {
3242                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3243                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3244                         {
3245                                 // multiply
3246                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3247                         }
3248                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3249                         {
3250                                 // add
3251                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3252                         }
3253                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3254                         {
3255                                 // alphablend
3256                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3257                         }
3258                 }
3259         }
3260         else
3261                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3262         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3263 }
3264
3265
3266
// Vertex stage of the post-process shader.
// Runs DPSOFTRAST_Array_TransformProject on the position array with the
// ModelViewProjection matrix uniform and issues two DPSOFTRAST_Array_Load
// calls: TEXCOORD0 paired with itself, and TEXCOORD1 paired with TEXCOORD4
// (note the differing array ids in the second call).
3267 void DPSOFTRAST_VertexShader_PostProcess(void)
3268 {
3269         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3270         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3271         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3272 }
3273
3274 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3275 {
3276         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3277         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3278         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3279         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3280         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3281         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3282         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3283         {
3284                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3285                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3286         }
3287         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3288         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3289         {
3290                 // TODO: implement saturation
3291         }
3292         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3293         {
3294                 // TODO: implement gammaramps
3295         }
3296         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3297 }
3298
3299
3300
// Vertex stage of the depth/shadow shader.
// Only the position array is processed: DPSOFTRAST_Array_TransformProject is
// run on it with the ModelViewProjection matrix uniform.
3301 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3302 {
3303         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3304 }
3305
3306 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3307 {
3308         // this is never called (because colormask is off when this shader is used)
3309         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3310         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3311         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3312         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3313         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3314 }
3315
3316
3317
// Vertex stage of the flat-color shader.
// Runs DPSOFTRAST_Array_TransformProject on the position array with the
// ModelViewProjection matrix uniform and DPSOFTRAST_Array_Transform on the
// first texcoord array with the TexMatrix uniform.
3318 void DPSOFTRAST_VertexShader_FlatColor(void)
3319 {
3320         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3321         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3322 }
3323
// Pixel stage of the flat-color shader: color texture multiplied by the
// Color_Ambient uniform, with the output alpha factor taken from the Alpha
// uniform.
// Pixels are written straight into the color framebuffer when the thread
// has no alpha test and an opaque blend mode; otherwise they are written to
// a local span buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Only pixels whose span->pixelmask byte is non-zero are written.
// The whole body is compiled out when SSE_POSSIBLE is not defined.
3324 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3325 {
3326 #ifdef SSE_POSSIBLE
3327         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // default destination: framebuffer row span->y starting at column span->x
3328         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3329         int x, startx = span->startx, endx = span->endx;
3330         __m128i Color_Ambientm;
3331         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3332         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3333         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3334         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3335         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
             // alpha test or blending needs the finish pass, so render to the local buffer
3336         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3337                 pixel = buffer_FragColorbgra8;
             // ambient color as fixed point (x256) with lanes 0 and 2 swapped;
             // lane 3 is then replaced by the Alpha uniform scaled by 255
3338         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3339         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3340         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3341         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3342         for (x = startx;x < endx;x++)
3343         {
3344                 __m128i color, pix;
                     // fast path: four consecutive pixels whose mask bytes are all 0x01
3345                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3346                 {
3347                         __m128i pix2;
3348                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3349                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3350                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3351                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                             // advance by 3 here; the loop increment supplies the fourth
3352                         x += 3;
3353                         continue;
3354                 }
                     // slow path: one pixel at a time, skipping masked-out pixels
3355                 if (!pixelmask[x])
3356                         continue;
3357                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3358                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3359                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3360         }
3361         if (pixel == buffer_FragColorbgra8)
3362                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3363 #endif
3364 }
3365
3366
3367
// Vertex stage of the vertex-color shader.
// Runs DPSOFTRAST_Array_TransformProject on the position array with the
// ModelViewProjection matrix uniform, passes the color array through
// DPSOFTRAST_Array_Load, and runs DPSOFTRAST_Array_Transform on the first
// texcoord array with the TexMatrix uniform.
3368 void DPSOFTRAST_VertexShader_VertexColor(void)
3369 {
3370         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3371         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3372         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3373 }
3374
// Pixel stage of the vertex-color shader:
// out = texture * (Color_Diffuse * interpolated vertex color + Color_Ambient),
// with the output alpha factor taken from the Alpha uniform (the diffuse
// alpha lane is masked to zero).
// The vertex color (DPSOFTRAST_ARRAY_COLOR) is interpolated linearly along
// the span in float and multiplied by buffer_z[x] for each pixel.
// Pixels are written straight into the color framebuffer when the thread
// has no alpha test and an opaque blend mode; otherwise they are written to
// a local span buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// Only pixels whose span->pixelmask byte is non-zero are written.
// The whole body is compiled out when SSE_POSSIBLE is not defined.
3375 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3376 {
3377 #ifdef SSE_POSSIBLE
3378         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // default destination: framebuffer row span->y starting at column span->x
3379         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3380         int x, startx = span->startx, endx = span->endx;
3381         __m128i Color_Ambientm, Color_Diffusem;
3382         __m128 data, slope;
3383         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3384         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3385         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3386         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3387         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3388         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
             // alpha test or blending needs the finish pass, so render to the local buffer
3389         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3390                 pixel = buffer_FragColorbgra8;
             // ambient color as fixed point (x256) with lanes 0 and 2 swapped;
             // lane 3 is then replaced by the Alpha uniform scaled by 255
3391         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3392         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3393         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3394         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse color as fixed point (x4096), lanes 0 and 2 swapped, lane 3 zeroed
3395         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3396         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3397         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
             // vertex color at startx and its per-pixel slope, both pre-scaled by 4096
3398         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3399         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3400         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3401         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3402         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3403         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
             // data is advanced by one slope step on every loop increment, including
             // the increments reached through "continue"
3404         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3405         {
3406                 __m128i color, mod, pix;
                     // fast path: four consecutive pixels whose mask bytes are all 0x01
3407                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3408                 {
3409                         __m128i pix2, mod2;
3410                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3411                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                             // mod holds pixels x and x+1, mod2 holds pixels x+2 and x+3;
                             // data is stepped three times here and once more by the loop
3412                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3413                         data = _mm_add_ps(data, slope);
3414                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3415                         data = _mm_add_ps(data, slope);
3416                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3417                         data = _mm_add_ps(data, slope);
3418                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3419                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3420                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3421                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3422                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3423                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                             // advance by 3 here; the loop increment supplies the fourth
3424                         x += 3;
3425                         continue;
3426                 }
                     // slow path: one pixel at a time, skipping masked-out pixels
3427                 if (!pixelmask[x])
3428                         continue;
3429                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3430                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3431                 mod = _mm_packs_epi32(mod, mod);
3432                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3433                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3434         }
3435         if (pixel == buffer_FragColorbgra8)
3436                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3437 #endif
3438 }
3439
3440
3441
// Vertex stage of the lightmap shader.
// Runs DPSOFTRAST_Array_TransformProject on the position array with the
// ModelViewProjection matrix uniform, runs DPSOFTRAST_Array_Transform on
// the first texcoord array with the TexMatrix uniform, and passes the
// TEXCOORD4 array through DPSOFTRAST_Array_Load.
3442 void DPSOFTRAST_VertexShader_Lightmap(void)
3443 {
3444         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3445         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3446         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3447 }
3448
// Pixel shader for lightmapped surfaces; shades one span of pixels.
// Per pixel, as visible in the arithmetic below:
//   out = color * (lightmap * Color_Diffuse + Color_Ambient)
//         [+ glow * Color_Glow   when SHADERPERMUTATION_GLOW is set]
// The alpha lane of Color_Diffuse and Color_Glow is forced to 0 and the alpha
// lane of Color_Ambient is replaced by the Alpha uniform, so output alpha is
// color.alpha scaled by Alpha.
// Output goes straight into color buffer 0 unless alpha test or a non-opaque
// blend mode is active; then it goes to a temporary span buffer that is handed
// to DPSOFTRAST_Draw_Span_FinishBGRA8.
// The whole body is compiled only when SSE_POSSIBLE is defined; without it the
// function does nothing.
3449 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3450 {
3451 #ifdef SSE_POSSIBLE
3452         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // default destination: framebuffer row span->y, offset span->x, 4 bytes per pixel
3453         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3454         int x, startx = span->startx, endx = span->endx;
3455         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3456         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3457         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3458         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3459         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3460         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3461         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // sample the color texture with TEXCOORD0 and the lightmap with TEXCOORD4
3462         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3463         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
             // alpha test / blending need a finishing pass, so render into the temporary buffer instead
3464         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3465                 pixel = buffer_FragColorbgra8;
             // Convert the float uniforms to integers scaled by 256.
             // _MM_SHUFFLE(3, 0, 1, 2) swaps lanes 0 and 2 - presumably RGBA uniform
             // order -> the BGRA order of the 8-bit buffers (confirm against the uniform layout).
3466         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
             // clear lane 3, then put Alpha * 255 there
3467         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3468         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
             // narrow to 16-bit lanes; both 64-bit halves now hold the same four values
3469         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3470         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
             // diffuse contributes nothing to alpha
3471         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3472         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3473         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3474         {
3475                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3476                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3477                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3478                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
                     // low 64 bits: ambient, high 64 bits: glow color; used by the single-pixel path
                     // below to evaluate the lit color and the glow term with one multiply
3479                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3480                 for (x = startx;x < endx;x++)
3481                 {
3482                         __m128i color, lightmap, glow, pix;
                             // fast path: at least 4 pixels remain and all four mask bytes are 1
                             // NOTE(review): the mask bytes are read through an unsigned int pointer at an arbitrary byte offset
3483                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3484                         {
3485                                 __m128i pix2;
3486                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3487                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3488                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
                                     // unpack(zero, v) puts each texture byte in the high byte of a 16-bit lane,
                                     // so _mm_mulhi_epu16 (high 16 bits of the product) acts as a fixed-point multiply;
                                     // pix covers pixels x and x+1, pix2 covers pixels x+2 and x+3
3489                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3490                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3491                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3492                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3493                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3494                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
                                     // saturate back to 8 bits per channel and store all 4 pixels
3495                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                     // together with the loop's x++ this advances by 4 pixels
3496                                 x += 3;
3497                                 continue;
3498                         }
                             // single-pixel path: skip pixels that are masked out
3499                         if (!pixelmask[x])
3500                                 continue;
                             // each of these has the pixel in the low 64 bits and zeros in the high 64 bits
3501                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3502                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3503                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
                             // low half:  (diffuse * lightmap + ambient) * color
                             // high half: (0 + glow color) * glow texel
3504                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
                             // fold the high half (glow term) onto the low half (lit color)
3505                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3506                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3507                 }
3508         }
3509         else
3510         {
                     // same loop as above without the glow term
3511                 for (x = startx;x < endx;x++)
3512                 {
3513                         __m128i color, lightmap, pix;
                             // fast path: 4 unmasked pixels at once
3514                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3515                         {
3516                                 __m128i pix2;
3517                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3518                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3519                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3520                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3521                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3522                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3523                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                     // together with the loop's x++ this advances by 4 pixels
3524                                 x += 3;
3525                                 continue;
3526                         }
                             // single-pixel path: skip pixels that are masked out
3527                         if (!pixelmask[x]) 
3528                                 continue;
3529                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3530                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3531                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3532                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3533                 }
3534         }
             // results were rendered into the temporary buffer: let the finish pass write them out
3535         if (pixel == buffer_FragColorbgra8)
3536                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3537 #endif
3538 }
3539
3540
// Forward declarations: the FakeLight and LightDirectionMap shader entry
// points that follow call these two functions before their definitions
// appear further down in the file.
3541 void DPSOFTRAST_VertexShader_LightDirection(void);
3542 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3543
// Vertex shader entry point for fake lighting.
// Has no work of its own: it runs the shared LightDirection vertex shader.
3544 void DPSOFTRAST_VertexShader_FakeLight(void)
3545 {
3546         DPSOFTRAST_VertexShader_LightDirection();
3547 }
3548
// Pixel shader entry point for fake lighting.
// Forwards its arguments unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which contains the branches for thread->shader_mode == SHADERMODE_FAKELIGHT.
3549 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3550 {
3551         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3552 }
3553
3554
3555
// Vertex shader entry point for model-space light direction maps.
// Runs the shared LightDirection vertex shader, then additionally loads
// TEXCOORD4, the coordinate set the LightDirection pixel shader uses to sample
// GL20TU_LIGHTMAP and GL20TU_DELUXEMAP.
3556 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3557 {
3558         DPSOFTRAST_VertexShader_LightDirection();
3559         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3560 }
3561
// Pixel shader entry point for model-space light direction maps.
// Forwards its arguments unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which contains the branches for
// thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE.
3562 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3563 {
3564         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3565 }
3566
3567
3568
// Vertex shader entry point for tangent-space light direction maps.
// Runs the shared LightDirection vertex shader, then additionally loads
// TEXCOORD4, the coordinate set the LightDirection pixel shader uses to sample
// GL20TU_LIGHTMAP and GL20TU_DELUXEMAP.
3569 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3570 {
3571         DPSOFTRAST_VertexShader_LightDirection();
3572         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3573 }
3574
// Pixel shader entry point for tangent-space light direction maps.
// Forwards its arguments unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which contains the branches for
// thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3575 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3576 {
3577         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3578 }
3579
3580
3581
3582 void DPSOFTRAST_VertexShader_LightDirection(void)
3583 {
3584         int i;
3585         int numvertices = dpsoftrast.numvertices;
3586         float LightDir[4];
3587         float LightVector[4];
3588         float EyePosition[4];
3589         float EyeVectorModelSpace[4];
3590         float EyeVector[4];
3591         float position[4];
3592         float svector[4];
3593         float tvector[4];
3594         float normal[4];
3595         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3596         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3597         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3598         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3599         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3600         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3601         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3602         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3603         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3604         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3605         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3606         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3607         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3608         for (i = 0;i < numvertices;i++)
3609         {
3610                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3611                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3612                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3613                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3614                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3615                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3616                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3617                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3618                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3619                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3620                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3621                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3622                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3623                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3624                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3625                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3626                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3627                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3628                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3629                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3630                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3631                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3632                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3633                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3634                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3635                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3636                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3637                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3638                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3639         }
3640         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3641 }
3642
3643 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3644 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3645 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3646 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3647 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3648 #define DPSOFTRAST_Vector3Normalize(v)\
3649 do\
3650 {\
3651         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3652         if (len)\
3653         {\
3654                 len = 1.0f / len;\
3655                 v[0] *= len;\
3656                 v[1] *= len;\
3657                 v[2] *= len;\
3658         }\
3659 }\
3660 while(0)
3661
3662 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3663 {
3664         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3665         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3666         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3667         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3668         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3669         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3670         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3671         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3672         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3673         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3674         int x, startx = span->startx, endx = span->endx;
3675         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3676         float LightVectordata[4];
3677         float LightVectorslope[4];
3678         float EyeVectordata[4];
3679         float EyeVectorslope[4];
3680         float VectorSdata[4];
3681         float VectorSslope[4];
3682         float VectorTdata[4];
3683         float VectorTslope[4];
3684         float VectorRdata[4];
3685         float VectorRslope[4];
3686         float z;
3687         float diffusetex[4];
3688         float glosstex[4];
3689         float surfacenormal[4];
3690         float lightnormal[4];
3691         float lightnormal_modelspace[4];
3692         float eyenormal[4];
3693         float specularnormal[4];
3694         float diffuse;
3695         float specular;
3696         float SpecularPower;
3697         int d[4];
3698         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3699         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3700         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3701         Color_Glow[3] = 0.0f;
3702         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3703         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3704         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3705         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3706         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3707         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3708         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3709         Color_Pants[3] = 0.0f;
3710         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3711         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3712         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3713         Color_Shirt[3] = 0.0f;
3714         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3715         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3716         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3717         {
3718                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3719                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3720         }
3721         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3722         {
3723                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3724         }
3725         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3726         {
3727                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3728                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3729                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3730                 Color_Diffuse[3] = 0.0f;
3731                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3732                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3733                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3734                 LightColor[3] = 0.0f;
3735                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3736                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3737                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3738                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3739                 Color_Specular[3] = 0.0f;
3740                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3741                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3742                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3743
3744                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3745                 {
3746                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3747                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3748                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3749                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3750                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3751                 }
3752                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3753                 {
3754                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3755                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3756                 }
3757                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3758                 {
3759                         // nothing of this needed
3760                 }
3761                 else
3762                 {
3763                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3764                 }
3765
3766                 for (x = startx;x < endx;x++)
3767                 {
3768                         z = buffer_z[x];
3769                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3770                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3771                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3772                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3773                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3774                         {
3775                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3776                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3777                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3778                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3779                         }
3780                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3781                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3782                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3783                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3784                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3785                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3786                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3787                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3788
3789                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3790                         {
3791                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3792                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3793                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3794                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3795
3796                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3797                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3798                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3799                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3800
3801                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3802                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3803                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3804                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3805
3806                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3807                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3808                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3809                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3810
3811                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3812                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3813
3814                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3815                                 {
3816                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3817                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3818                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3819                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3820                                 }
3821                         }
3822                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3823                         {
3824                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3825                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3826                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3827                                 {
3828                                         float f = 1.0f / 256.0f;
3829                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3830                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3831                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3832                                 }
3833                         }
3834                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3835                         {
3836                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3837                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3838                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3839                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3840
3841                                 LightColor[0] = 1.0;
3842                                 LightColor[1] = 1.0;
3843                                 LightColor[2] = 1.0;
3844                         }
3845                         else
3846                         {
3847                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3848                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3849                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3850                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3851                         }
3852
3853                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3854
3855                         if(thread->shader_exactspecularmath)
3856                         {
3857                                 // reflect lightnormal at surfacenormal, take the negative of that
3858                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3859                                 float f;
3860                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3861                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3862                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3863                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3864
3865                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3866                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3867                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3868                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3869                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3870
3871                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3872                         }
3873                         else
3874                         {
3875                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3876                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3877                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3878                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3879
3880                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3881                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3882                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3883                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3884
3885                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3886                         }
3887
3888                         specular = pow(specular, SpecularPower * glosstex[3]);
3889                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3890                         {
3891                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3892                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3893                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3894                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3895                         }
3896                         else
3897                         {
3898                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3899                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3900                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3901                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3902                         }
3903
3904                         buffer_FragColorbgra8[x*4+0] = d[0];
3905                         buffer_FragColorbgra8[x*4+1] = d[1];
3906                         buffer_FragColorbgra8[x*4+2] = d[2];
3907                         buffer_FragColorbgra8[x*4+3] = d[3];
3908                 }
3909         }
3910         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3911         {
3912                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3913                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3914                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3915                 Color_Diffuse[3] = 0.0f;
3916                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3917                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3918                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3919                 LightColor[3] = 0.0f;
3920                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3921
3922                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3923                 {
3924                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3925                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3926                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3927                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3928                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3929                 }
3930                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3931                 {
3932                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3933                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3934                 }
3935                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3936                 {
3937                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3938                 }
3939                 else
3940                 {
3941                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3942                 }
3943
3944                 for (x = startx;x < endx;x++)
3945                 {
3946                         z = buffer_z[x];
3947                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3948                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3949                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3950                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3951                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3952                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3953                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3954                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3955
3956                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3957                         {
3958                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3959                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3960                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3961                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3962
3963                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3964                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3965                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3966                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3967
3968                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3969                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3970                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3971                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3972
3973                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3974                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3975                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3976                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3977
3978                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3979                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3980
3981                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3982                                 {
3983                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3984                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3985                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3986                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3987                                 }
3988                         }
3989                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3990                         {
3991                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3992                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3993                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3994                                 {
3995                                         float f = 1.0f / 256.0f;
3996                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3997                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3998                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3999                                 }
4000                         }
4001                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
4002                         {
4003                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4004                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4005                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4006                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4007
4008                                 LightColor[0] = 1.0;
4009                                 LightColor[1] = 1.0;
4010                                 LightColor[2] = 1.0;
4011                         }
4012                         else
4013                         {
4014                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4015                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4016                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4017                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4018                         }
4019
4020                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4021                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4022                         {
4023                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4024                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4025                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4026                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4027                         }
4028                         else
4029                         {
4030                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4031                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4032                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4033                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4034                         }
4035                         buffer_FragColorbgra8[x*4+0] = d[0];
4036                         buffer_FragColorbgra8[x*4+1] = d[1];
4037                         buffer_FragColorbgra8[x*4+2] = d[2];
4038                         buffer_FragColorbgra8[x*4+3] = d[3];
4039                 }
4040         }
4041         else
4042         {
4043                 for (x = startx;x < endx;x++)
4044                 {
4045                         z = buffer_z[x];
4046                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4047                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4048                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4049                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4050
4051                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4052                         {
4053                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4054                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4055                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4056                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4057                         }
4058                         else
4059                         {
4060                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4061                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4062                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4063                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4064                         }
4065                         buffer_FragColorbgra8[x*4+0] = d[0];
4066                         buffer_FragColorbgra8[x*4+1] = d[1];
4067                         buffer_FragColorbgra8[x*4+2] = d[2];
4068                         buffer_FragColorbgra8[x*4+3] = d[3];
4069                 }
4070         }
4071         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4072 }
4073
4074
4075
4076 void DPSOFTRAST_VertexShader_LightSource(void)
4077 {
4078         int i;
4079         int numvertices = dpsoftrast.numvertices;
4080         float LightPosition[4];
4081         float LightVector[4];
4082         float LightVectorModelSpace[4];
4083         float EyePosition[4];
4084         float EyeVectorModelSpace[4];
4085         float EyeVector[4];
4086         float position[4];
4087         float svector[4];
4088         float tvector[4];
4089         float normal[4];
4090         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4091         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4092         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4093         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4094         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4095         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4096         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4097         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4098         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4099         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4100         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4101         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4102         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4103         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4104         for (i = 0;i < numvertices;i++)
4105         {
4106                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4107                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4108                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4109                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4110                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4111                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4112                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4113                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4114                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4115                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4116                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4117                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4118                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4119                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4120                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4121                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4122                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4123                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4124                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4125                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4126                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4127                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4128                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4129                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4130                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4131                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4132                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4133                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4134                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4135                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4136                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4137                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4138         }
4139         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4140         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4141 }
4142
4143 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4144 {
4145 #ifdef SSE_POSSIBLE
4146         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4147         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4148         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4149         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4150         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4151         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4152         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4153         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4154         int x, startx = span->startx, endx = span->endx;
4155         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4156         float CubeVectordata[4];
4157         float CubeVectorslope[4];
4158         float LightVectordata[4];
4159         float LightVectorslope[4];
4160         float EyeVectordata[4];
4161         float EyeVectorslope[4];
4162         float z;
4163         float diffusetex[4];
4164         float glosstex[4];
4165         float surfacenormal[4];
4166         float lightnormal[4];
4167         float eyenormal[4];
4168         float specularnormal[4];
4169         float diffuse;
4170         float specular;
4171         float SpecularPower;
4172         float CubeVector[4];
4173         float attenuation;
4174         int d[4];
4175         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4176         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4177         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4178         Color_Glow[3] = 0.0f;
4179         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4180         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4181         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4182         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4183         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4184         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4185         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4186         Color_Diffuse[3] = 0.0f;
4187         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4188         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4189         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4190         Color_Specular[3] = 0.0f;
4191         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4192         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4193         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4194         Color_Pants[3] = 0.0f;
4195         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4196         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4197         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4198         Color_Shirt[3] = 0.0f;
4199         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4200         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4201         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4202         LightColor[3] = 0.0f;
4203         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4204         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4205         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4206         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4207         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4208         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4209         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4210         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4211         {
4212                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4213                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4214         }
4215         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4216                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4217         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4218         {
4219                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4220                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4221                 for (x = startx;x < endx;x++)
4222                 {
4223                         z = buffer_z[x];
4224                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4225                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4226                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4227                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4228                         if (attenuation < 0.01f)
4229                                 continue;
4230                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4231                         {
4232                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4233                                 if (attenuation < 0.01f)
4234                                         continue;
4235                         }
4236
4237                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4238                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4239                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4240                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4241                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4242                         {
4243                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4244                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4245                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4246                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4247                         }
4248                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4249                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4250                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4251                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4252                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4253                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4254                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4255                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4256
4257                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4258                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4259                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4260                         DPSOFTRAST_Vector3Normalize(lightnormal);
4261
4262                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4263
4264                         if(thread->shader_exactspecularmath)
4265                         {
4266                                 // reflect lightnormal at surfacenormal, take the negative of that
4267                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4268                                 float f;
4269                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4270                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4271                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4272                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4273
4274                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4275                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4276                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4277                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4278                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4279
4280                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4281                         }
4282                         else
4283                         {
4284                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4285                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4286                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4287                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4288
4289                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4290                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4291                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4292                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4293
4294                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4295                         }
4296                         specular = pow(specular, SpecularPower * glosstex[3]);
4297
4298                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4299                         {
4300                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4301                                 attenuation *= (1.0f / 255.0f);
4302                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4303                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4304                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4305                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4306                         }
4307                         else
4308                         {
4309                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4310                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4311                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4312                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4313                         }
4314                         buffer_FragColorbgra8[x*4+0] = d[0];
4315                         buffer_FragColorbgra8[x*4+1] = d[1];
4316                         buffer_FragColorbgra8[x*4+2] = d[2];
4317                         buffer_FragColorbgra8[x*4+3] = d[3];
4318                 }
4319         }
4320         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4321         {
4322                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4323                 for (x = startx;x < endx;x++)
4324                 {
4325                         z = buffer_z[x];
4326                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4327                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4328                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4329                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4330                         if (attenuation < 0.01f)
4331                                 continue;
4332                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4333                         {
4334                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4335                                 if (attenuation < 0.01f)
4336                                         continue;
4337                         }
4338
4339                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4340                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4341                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4342                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4343                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4344                         {
4345                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4346                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4347                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4348                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4349                         }
4350                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4351                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4352                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4353                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4354
4355                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4356                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4357                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4358                         DPSOFTRAST_Vector3Normalize(lightnormal);
4359
4360                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4361                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4362                         {
4363                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4364                                 attenuation *= (1.0f / 255.0f);
4365                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4366                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4367                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4368                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4369                         }
4370                         else
4371                         {
4372                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4373                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4374                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4375                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4376                         }
4377                         buffer_FragColorbgra8[x*4+0] = d[0];
4378                         buffer_FragColorbgra8[x*4+1] = d[1];
4379                         buffer_FragColorbgra8[x*4+2] = d[2];
4380                         buffer_FragColorbgra8[x*4+3] = d[3];
4381                 }
4382         }
4383         else
4384         {
4385                 for (x = startx;x < endx;x++)
4386                 {
4387                         z = buffer_z[x];
4388                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4389                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4390                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4391                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4392                         if (attenuation < 0.01f)
4393                                 continue;
4394                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4395                         {
4396                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4397                                 if (attenuation < 0.01f)
4398                                         continue;
4399                         }
4400
4401                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4402                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4403                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4404                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4405                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4406                         {
4407                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4408                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4409                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4410                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4411                         }
4412                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4413                         {
4414                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4415                                 attenuation *= (1.0f / 255.0f);
4416                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4417                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4418                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4419                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4420                         }
4421                         else
4422                         {
4423                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4424                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4425                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4426                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4427                         }
4428                         buffer_FragColorbgra8[x*4+0] = d[0];
4429                         buffer_FragColorbgra8[x*4+1] = d[1];
4430                         buffer_FragColorbgra8[x*4+2] = d[2];
4431                         buffer_FragColorbgra8[x*4+3] = d[3];
4432                 }
4433         }
4434         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4435 #endif
4436 }
4437
4438
4439
4440 void DPSOFTRAST_VertexShader_Refraction(void)
4441 {
4442         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4443         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4444         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4445 }
4446
4447 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4448 {
4449         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4450         float z;
4451         int x, startx = span->startx, endx = span->endx;
4452
4453         // texture reads
4454         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4455         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4456
4457         // varyings
4458         float ModelViewProjectionPositiondata[4];
4459         float ModelViewProjectionPositionslope[4];
4460
4461         // uniforms
4462         float ScreenScaleRefractReflect[2];
4463         float ScreenCenterRefractReflect[2];
4464         float DistortScaleRefractReflect[2];
4465         float RefractColor[4];
4466
4467         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4468         if(!texture) return;
4469
4470         // read textures
4471         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4472         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4473
4474         // read varyings
4475         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4476
4477         // read uniforms
4478         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4479         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4480         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4481         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4482         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4483         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4484         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4485         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4486         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4487         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4488
4489         // do stuff
4490         for (x = startx;x < endx;x++)
4491         {
4492                 float SafeScreenTexCoord[2];
4493                 float ScreenTexCoord[2];
4494                 float v[3];
4495                 float iw;
4496                 unsigned char c[4];
4497
4498                 z = buffer_z[x];
4499
4500                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4501                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4502
4503                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4504                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4505                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4506
4507                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4508                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4509                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4510                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4511                 DPSOFTRAST_Vector3Normalize(v);
4512                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4513                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4514
4515                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4516                 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4517
4518                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4519                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4520                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4521                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4522         }
4523
4524         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4525 }
4526
4527
4528
4529 void DPSOFTRAST_VertexShader_Water(void)
4530 {
4531         int i;
4532         int numvertices = dpsoftrast.numvertices;
4533         float EyePosition[4];
4534         float EyeVectorModelSpace[4];
4535         float EyeVector[4];
4536         float position[4];
4537         float svector[4];
4538         float tvector[4];
4539         float normal[4];
4540         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4541         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4542         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4543         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4544         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4545         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4546         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4547         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4548         for (i = 0;i < numvertices;i++)
4549         {
4550                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4551                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4552                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4553                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4554                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4555                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4556                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4557                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4558                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4559                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4560                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4561                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4562                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4563                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4564                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4565                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4566                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4567                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4568                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4569                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4570                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4571                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4572         }
4573         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4574         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4575         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4576 }
4577
4578
4579 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4580 {
4581         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4582         float z;
4583         int x, startx = span->startx, endx = span->endx;
4584
4585         // texture reads
4586         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4587         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4588
4589         // varyings
4590         float ModelViewProjectionPositiondata[4];
4591         float ModelViewProjectionPositionslope[4];
4592         float EyeVectordata[4];
4593         float EyeVectorslope[4];
4594
4595         // uniforms
4596         float ScreenScaleRefractReflect[2];
4597         float ScreenCenterRefractReflect[2];
4598         float DistortScaleRefractReflect[2];
4599         float RefractColor[4];
4600         float ReflectColor[4];
4601         float ReflectFactor;
4602         float ReflectOffset;
4603
4604         DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4605         DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4606         if(!texture_refraction || !texture_reflection) return;
4607
4608         // read textures
4609         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4610         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4611
4612         // read varyings
4613         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4614         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4615
4616         // read uniforms
4617         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4618         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4619         ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4620         ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4621         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4622         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4623         ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4624         ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4625         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4626         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4627         DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4628         DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4629         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4630         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4631         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4632         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4633         ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4634         ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4635         ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4636         ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4637         ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4638         ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4639
4640         // do stuff
4641         for (x = startx;x < endx;x++)
4642         {
4643                 float SafeScreenTexCoord[4];
4644                 float ScreenTexCoord[4];
4645                 float v[3];
4646                 float iw;
4647                 unsigned char c1[4];
4648                 unsigned char c2[4];
4649                 float Fresnel;
4650
4651                 z = buffer_z[x];
4652
4653                 // "    vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4654                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4655
4656                 // "    vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4657                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4658                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4659                 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4660                 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4661
4662                 // "    vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4663                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4664                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4665                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4666                 DPSOFTRAST_Vector3Normalize(v);
4667                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4668                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4669                 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4670                 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4671
4672                 // "    float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4673                 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4674                 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4675                 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4676                 DPSOFTRAST_Vector3Normalize(v);
4677                 Fresnel = 1.0f - v[2];
4678                 Fresnel = min(1.0f, Fresnel);
4679                 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4680
4681                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4682                 // "    dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4683                 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4684                 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4685
4686                 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4687                 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4688                 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4689                 buffer_FragColorbgra8[x*4+3] = min((    RefractColor[3] *  (1.0f - Fresnel) +          ReflectColor[3]  * Fresnel) * 256, 255);
4690         }
4691
4692         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4693 }
4694
4695
4696
4697 void DPSOFTRAST_VertexShader_ShowDepth(void)
4698 {
4699         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4700 }
4701
4702 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4703 {
4704         // TODO: IMPLEMENT
4705         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4706         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4707         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4708         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4709         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4710 }
4711
4712
4713
4714 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4715 {
4716         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4717 }
4718
4719 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4720 {
4721         // TODO: IMPLEMENT
4722         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4723         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4724         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4725         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4726         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4727 }
4728
4729
4730
4731 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4732 {
4733         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4734 }
4735
4736 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4737 {
4738         // TODO: IMPLEMENT
4739         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4740         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4741         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4742         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4743         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4744 }
4745
4746
4747
4748 typedef struct DPSOFTRAST_ShaderModeInfo_s
4749 {
4750         int lodarrayindex;
4751         void (*Vertex)(void);
4752         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4753         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4754         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4755 }
4756 DPSOFTRAST_ShaderModeInfo;
4757
4758 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4759 {
4760         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4761         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4762         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4763         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4764         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4765         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4766         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4767         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4768         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4769         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4770         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4771         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4772         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4773         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4774         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4775         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4776 };
4777
4778 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4779 {
4780         int x;
4781         int startx;
4782         int endx;
4783         unsigned int *depthpixel;
4784         int depth;
4785         int depthslope;
4786         unsigned int d;
4787         unsigned char *pixelmask;
4788         DPSOFTRAST_State_Triangle *triangle;
4789         triangle = &thread->triangles[span->triangle];
4790         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4791         startx = span->startx;
4792         endx = span->endx;
4793         depth = span->depthbase;
4794         depthslope = span->depthslope;
4795         pixelmask = thread->pixelmaskarray;
4796         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4797         {
4798                 switch(thread->fb_depthfunc)
4799                 {
4800                 default:
4801                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4802                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4803                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4804                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4805                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4806                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4807                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4808                 }
4809                 while (startx < endx && !pixelmask[startx])
4810                         startx++;
4811                 while (endx > startx && !pixelmask[endx-1])
4812                         endx--;
4813         }
4814         else
4815         {
4816                 // no depth testing means we're just dealing with color...
4817                 memset(pixelmask + startx, 1, endx - startx);
4818         }
4819         span->pixelmask = pixelmask;
4820         span->startx = startx;
4821         span->endx = endx;
4822 }
4823
4824 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4825 {
4826         int x, d, depth, depthslope, startx, endx;
4827         const unsigned char *pixelmask;
4828         unsigned int *depthpixel;
4829         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4830         {
4831                 depth = span->depthbase;
4832                 depthslope = span->depthslope;
4833                 pixelmask = span->pixelmask;
4834                 startx = span->startx;
4835                 endx = span->endx;
4836                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4837                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4838                         if (pixelmask[x])
4839                                 depthpixel[x] = d;
4840         }
4841 }
4842
4843 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4844 {
4845         int i;
4846         DPSOFTRAST_State_Triangle *triangle;
4847         DPSOFTRAST_State_Span *span;
4848         for (i = 0; i < thread->numspans; i++)
4849         {
4850                 span = &thread->spans[i];
4851                 triangle = &thread->triangles[span->triangle];
4852                 DPSOFTRAST_Draw_DepthTest(thread, span);
4853                 if (span->startx >= span->endx)
4854                         continue;
4855                 // run pixel shader if appropriate
4856                 // do this before running depthmask code, to allow the pixelshader
4857                 // to clear pixelmask values for alpha testing
4858                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4859                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4860                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4861         }
4862         thread->numspans = 0;
4863 }
4864
4865 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4866
4867 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4868 {
4869 #ifdef SSE_POSSIBLE
4870         int cullface = thread->cullface;
4871         int minx, maxx, miny, maxy;
4872         int miny1, maxy1, miny2, maxy2;
4873         __m128i fbmin, fbmax;
4874         __m128 viewportcenter, viewportscale;
4875         int firstvertex = command->firstvertex;
4876         int numvertices = command->numvertices;
4877         int numtriangles = command->numtriangles;
4878         const int *element3i = command->element3i;
4879         const unsigned short *element3s = command->element3s;
4880         int clipped = command->clipped;
4881         int i;
4882         int j;
4883         int k;
4884         int y;
4885         int e[3];
4886         __m128i screeny;
4887         int starty, endy, bandy;
4888         int numpoints;
4889         int clipcase;
4890         float clipdist[4];
4891         float clip0origin, clip0slope;
4892         int clip0dir;
4893         __m128 triangleedge1, triangleedge2, trianglenormal;
4894         __m128 clipfrac[3];
4895         __m128 screen[4];
4896         DPSOFTRAST_State_Triangle *triangle;
4897         DPSOFTRAST_Texture *texture;
4898         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4899         miny = thread->fb_scissor[1];
4900         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4901         miny1 = bound(miny, thread->miny1, maxy);
4902         maxy1 = bound(miny, thread->maxy1, maxy);
4903         miny2 = bound(miny, thread->miny2, maxy);
4904         maxy2 = bound(miny, thread->maxy2, maxy);
4905         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4906         {
4907                 if (!ATOMIC_DECREMENT(command->refcount))
4908                 {
4909                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4910                                 MM_FREE(command->arrays);
4911                 }
4912                 return;
4913         }
4914         minx = thread->fb_scissor[0];
4915         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4916         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4917         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4918         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4919         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4920         screen[3] = _mm_setzero_ps();
4921         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4922         for (i = 0;i < numtriangles;i++)
4923         {
4924                 const float *screencoord4f = command->arrays;
4925                 const float *arrays = screencoord4f + numvertices*4;
4926
4927                 // generate the 3 edges of this triangle
4928                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4929                 if (element3s)
4930                 {
4931                         e[0] = element3s[i*3+0] - firstvertex;
4932                         e[1] = element3s[i*3+1] - firstvertex;
4933                         e[2] = element3s[i*3+2] - firstvertex;
4934                 }
4935                 else if (element3i)
4936                 {
4937                         e[0] = element3i[i*3+0] - firstvertex;
4938                         e[1] = element3i[i*3+1] - firstvertex;
4939                         e[2] = element3i[i*3+2] - firstvertex;
4940                 }
4941                 else
4942                 {
4943                         e[0] = i*3+0;
4944                         e[1] = i*3+1;
4945                         e[2] = i*3+2;
4946                 }
4947
4948 #define SKIPBACKFACE \
4949                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4950                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4951                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4952                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4953                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4954                 switch(cullface) \
4955                 { \
4956                 case GL_BACK: \
4957                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4958                                 continue; \
4959                         break; \
4960                 case GL_FRONT: \
4961                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4962                                 continue; \
4963                         break; \
4964                 }
4965
4966 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4967                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4968                         { \
4969                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4970                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4971                         }
4972 #define CLIPPEDVERTEXCOPY(k,p1) \
4973                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4974
4975 #define GENATTRIBCOPY(attrib, p1) \
4976                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4977 #define GENATTRIBLERP(attrib, p1, p2) \
4978                 { \
4979                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4980                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4981                 }
4982 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4983                 switch(clipcase) \
4984                 { \
4985                 default: \
4986                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4987                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4988                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4989                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4990                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4991                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4992                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4993                 }
4994
4995                 if (! clipped)
4996                         goto notclipped;
4997
4998                 // calculate distance from nearplane
4999                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
5000                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
5001                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
5002                 if (clipdist[0] >= 0.0f)
5003                 {
5004                         if (clipdist[1] >= 0.0f)
5005                         {
5006                                 if (clipdist[2] >= 0.0f)
5007                                 {
5008                                 notclipped:
5009                                         // triangle is entirely in front of nearplane
5010                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5011                                         SKIPBACKFACE;
5012                                         numpoints = 3;
5013                                         clipcase = 0;
5014                                 }
5015                                 else
5016                                 {
5017                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5018                                         SKIPBACKFACE;
5019                                         numpoints = 4;
5020                                         clipcase = 1;
5021                                 }
5022                         }
5023                         else
5024                         {
5025                                 if (clipdist[2] >= 0.0f)
5026                                 {
5027                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5028                                         SKIPBACKFACE;
5029                                         numpoints = 4;
5030                                         clipcase = 2;
5031                                 }
5032                                 else
5033                                 {
5034                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5035                                         SKIPBACKFACE;
5036                                         numpoints = 3;
5037                                         clipcase = 3;
5038                                 }
5039                         }
5040                 }
5041                 else if (clipdist[1] >= 0.0f)
5042                 {
5043                         if (clipdist[2] >= 0.0f)
5044                         {
5045                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5046                                 SKIPBACKFACE;
5047                                 numpoints = 4;
5048                                 clipcase = 4;
5049                         }
5050                         else
5051                         {
5052                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5053                                 SKIPBACKFACE;
5054                                 numpoints = 3;
5055                                 clipcase = 5;
5056                         }
5057                 }
5058                 else if (clipdist[2] >= 0.0f)
5059                 {
5060                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5061                         SKIPBACKFACE;
5062                         numpoints = 3;
5063                         clipcase = 6;
5064                 }
5065                 else continue; // triangle is entirely behind nearplane
5066
5067                 {
5068                         // calculate integer y coords for triangle points
5069                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5070                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5071                                         screenmin = _mm_min_epi16(screeni, screenir),
5072                                         screenmax = _mm_max_epi16(screeni, screenir);
5073                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5074                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5075                         screenmin = _mm_max_epi16(screenmin, fbmin);
5076                         screenmax = _mm_min_epi16(screenmax, fbmax);
5077                         // skip offscreen triangles
5078                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5079                                 continue;
5080                         starty = _mm_extract_epi16(screenmin, 1);
5081                         endy = _mm_extract_epi16(screenmax, 1)+1;
5082                         if (starty >= maxy1 && endy <= miny2)
5083                                 continue;
5084                         screeny = _mm_srai_epi32(screeni, 16);
5085                 }
5086
5087                 triangle = &thread->triangles[thread->numtriangles];
5088
5089                 // calculate attribute plans for triangle data...
5090                 // okay, this triangle is going to produce spans, we'd better project
5091                 // the interpolants now (this is what gives perspective texturing),
5092                 // this consists of simply multiplying all arrays by the W coord
5093                 // (which is basically 1/Z), which will be undone per-pixel
5094                 // (multiplying by Z again) to get the perspective-correct array
5095                 // values
5096                 {
5097                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5098                         __m128 mipedgescale, mipdensity;
5099                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5100                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5101                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5102                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5103                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5104                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5105                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5106                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5107                         attribedge1 = _mm_sub_ss(w0, w1);
5108                         attribedge2 = _mm_sub_ss(w2, w1);
5109                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5110                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5111                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5112                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5113                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5114                         _mm_store_ss(&triangle->w[0], attribxslope);
5115                         _mm_store_ss(&triangle->w[1], attribyslope);
5116                         _mm_store_ss(&triangle->w[2], attriborigin);
5117                         
5118                         clip0origin = 0;
5119                         clip0slope = 0;
5120                         clip0dir = 0;
5121                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5122                         {
5123                                 float cliporigin, clipxslope, clipyslope;
5124                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5125                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5126                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5127                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5128                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5129                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5130                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5131                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5132                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5133                                 if(clipxslope != 0)
5134                                 {
5135                                         clip0origin = -cliporigin/clipxslope;
5136                                         clip0slope = -clipyslope/clipxslope;
5137                                         clip0dir = clipxslope > 0 ? 1 : -1;
5138                                 }
5139                                 else if(clipyslope > 0)
5140                                 {
5141                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5142                                         clip0slope = dpsoftrast.fb_width;
5143                                         clip0dir = -1;
5144                                 }
5145                                 else if(clipyslope < 0)
5146                                 {
5147                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5148                                         clip0slope = -dpsoftrast.fb_width;
5149                                         clip0dir = -1;
5150                                 }
5151                                 else if(clip0origin < 0) continue;
5152                         }
5153
5154                         mipedgescale = _mm_setzero_ps();
5155                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5156                         {
5157                                 __m128 attrib0, attrib1, attrib2;
5158                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5159                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5160                                         break;
5161                                 arrays += numvertices*4;
5162                                 GENATTRIBS(attrib0, attrib1, attrib2);
5163                                 attriborigin = _mm_mul_ps(attrib1, w1);
5164                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5165                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5166                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5167                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5168                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5169                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5170                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5171                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5172                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5173                                 {
5174                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5175                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5176                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5177                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5178                                 }
5179                         }
5180
5181                         memset(triangle->mip, 0, sizeof(triangle->mip));
5182                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5183                         {
5184                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5185                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5186                                         break;
5187                                 texture = thread->texbound[texunit];
5188                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5189                                 {
5190                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5191                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5192                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5193                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5194                                         // this will be multiplied in the texturing routine by the texture resolution
5195                                         y = _mm_cvtss_si32(mipdensity);
5196                                         if (y > 0)
5197                                         {
5198                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5199                                                 if (y > texture->mipmaps - 1)
5200                                                         y = texture->mipmaps - 1;
5201                                                 triangle->mip[texunit] = y;
5202                                         }
5203                                 }
5204                         }
5205                 }
5206         
5207                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5208                 for (; y < bandy;)
5209                 {
5210                         __m128 xcoords, xslope;
5211                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5212                         int yccmask = _mm_movemask_epi8(ycc);
5213                         int edge0p, edge0n, edge1p, edge1n;
5214                         int nexty;
5215                         float w, wslope;
5216                         float clip0;
5217                         if (numpoints == 4)
5218                         {
5219                                 switch(yccmask)
5220                                 {
5221                                 default:
5222                                 case 0xFFFF: /*0000*/ y = endy; continue;
5223                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5224                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5225                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5226                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5227                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5228                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5229                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5230                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5231                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5232                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5233                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5234                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5235                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5236                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5237                                 case 0x0000: /*1111*/ y++; continue;
5238                                 }
5239                         }
5240                         else
5241                         {
5242                                 switch(yccmask)
5243                                 {
5244                                 default:
5245                                 case 0xFFFF: /*000*/ y = endy; continue;
5246                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5247                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5248                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5249                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5250                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5251                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5252                                 case 0x0000: /*111*/ y++; continue;
5253                                 }
5254                         }
5255                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5256                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5257                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5258                         nexty = _mm_extract_epi16(ycc, 0);
5259                         if (nexty >= bandy) nexty = bandy-1;
5260                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5261                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5262                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5263                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5264                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5265                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5266                         {
5267                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5268                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5269                         }
5270                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5271                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5272                         {
5273                                 int startx, endx, offset;
5274                                 startx = _mm_cvtss_si32(xcoords);
5275                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5276                                 if (startx < minx) startx = minx;
5277                                 if (endx > maxx) endx = maxx;
5278                                 if (startx >= endx) continue;
5279
5280                                 if (clip0dir)
5281                                 {
5282                                         if (clip0dir > 0)
5283                                         {
5284                                                 if (startx < clip0) 
5285                                                 {
5286                                                         if(endx <= clip0) continue;
5287                                                         startx = (int)clip0;
5288                                                 }
5289                                         }
5290                                         else if (endx > clip0) 
5291                                         {
5292                                                 if(startx >= clip0) continue;
5293                                                 endx = (int)clip0;
5294                                         }
5295                                 }
5296                                                 
5297                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5298                                 {
5299                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5300                                         span->triangle = thread->numtriangles;
5301                                         span->x = offset;
5302                                         span->y = y;
5303                                         span->startx = 0;
5304                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5305                                         if (span->startx >= span->endx)
5306                                                 continue;
5307                                         wslope = triangle->w[0];
5308                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5309                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5310                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5311                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5312                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5313                                 }
5314                         }
5315                 }
5316
5317                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5318                 {
5319                         DPSOFTRAST_Draw_ProcessSpans(thread);
5320                         thread->numtriangles = 0;
5321                 }
5322         }
5323
5324         if (!ATOMIC_DECREMENT(command->refcount))
5325         {
5326                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5327                         MM_FREE(command->arrays);
5328         }
5329
5330         if (thread->numspans > 0 || thread->numtriangles > 0)
5331         {
5332                 DPSOFTRAST_Draw_ProcessSpans(thread);
5333                 thread->numtriangles = 0;
5334         }
5335 #endif
5336 }
5337
// Allocates a Draw command for one DPSOFTRAST_DrawTriangles call and carves up
// the vertex data area that belongs to it.
//
// The data area holds, in order: the screen coordinate array
// (dpsoftrast.screencoord4f), the DPSOFTRAST_ARRAY_POSITION output array, one
// float[4]-per-vertex array for each entry of the active shader mode's
// `arrays` list (the list ends at the first entry >= DPSOFTRAST_ARRAY_TOTAL),
// and finally a private copy of the index list.  element3s takes precedence
// over element3i; when both are NULL no indices are copied and both
// command->element3i and command->element3s stay NULL.
//
// Side effects: overwrites dpsoftrast.firstvertex, dpsoftrast.numvertices,
// dpsoftrast.screencoord4f and every slot of dpsoftrast.post_array4f.
// Returns the new command; starty, endy, clipped and refcount are not set here.
5338 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5339 {
5340         int i;
5341         int j;
5342         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
             // two float[4] arrays are always present: screen coordinates and positions
5343         int datasize = 2*numvertices*sizeof(float[4]);
5344         DPSOFTRAST_Command_Draw *command;
5345         unsigned char *data;
             // add one float[4] array per attribute array used by the current shader mode
5346         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5347         {
5348                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5349                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5350                         break;
5351                 datasize += numvertices*sizeof(float[4]);
5352         }
             // room for the copy of the index list (16 bit indices win if both are given)
5353         if (element3s)
5354                 datasize += numtriangles*sizeof(unsigned short[3]);
5355         else if (element3i)
5356                 datasize += numtriangles*sizeof(int[3]);
5357         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5358         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5359         {
                     // too large to store inline: only the header is allocated as a command
                     // and the data lives in a separate MM_CALLOC buffer, which the code that
                     // retires the command releases with MM_FREE(command->arrays)
                     // NOTE(review): the MM_CALLOC result is not checked for NULL
5360                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5361                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5362         }
5363         else
5364         {
                     // small enough: the data directly follows the command header
5365                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5366                 data = (unsigned char *)command + commandsize;
5367         }
5368         command->firstvertex = firstvertex;
5369         command->numvertices = numvertices;
5370         command->numtriangles = numtriangles;
5371         command->arrays = (float *)data;
             // hand out consecutive slices of the data area in the same order that was
             // used for the size computation above
5372         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5373         dpsoftrast.firstvertex = firstvertex;
5374         dpsoftrast.numvertices = numvertices;
5375         dpsoftrast.screencoord4f = (float *)data;
5376         data += numvertices*sizeof(float[4]);
5377         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5378         data += numvertices*sizeof(float[4]);
5379         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5380         {
5381                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5382                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5383                         break;
5384                 dpsoftrast.post_array4f[j] = (float *)data;
5385                 data += numvertices*sizeof(float[4]);
5386         }
             // the index list is copied into the command, so the caller's index
             // pointers are not referenced after this function returns
5387         command->element3i = NULL;
5388         command->element3s = NULL;
5389         if (element3s)
5390         {
5391                 command->element3s = (unsigned short *)data;
5392                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5393         }
5394         else if (element3i)
5395         {
5396                 command->element3i = (int *)data;
5397                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5398         }
5399         return command;
5400 }
5401
// Queues one indexed triangle draw.
//
// firstvertex/numvertices select the vertex range, numtriangles is the number
// of index triples, and element3i / element3s are the 32 bit / 16 bit index
// lists (element3s is preferred by the allocator when both are given).
//
// The vertex function of the current shader mode runs immediately on the
// calling thread; the command is then either handed to the worker threads or,
// without threading, executed right away through DPSOFTRAST_Draw_FlushThreads.
5402 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5403 {
5404         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
             // NOTE(review): the vertex stage presumably fills the arrays set up by the
             // allocator and updates dpsoftrast.drawstarty/drawendy/drawclipped, which
             // are read below - confirm against the shader mode vertex functions
5405         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
             // clamp the vertical extent of the draw to the framebuffer
5406         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5407         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5408         if (command->starty >= command->endy)
5409         {
                     // nothing covers a framebuffer row: release the separately allocated
                     // data (a header-only command size marks that case) and take the
                     // command back out of the queue
5410                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5411                         MM_FREE(command->arrays);
5412                 DPSOFTRAST_UndoCommand(command->commandsize);
5413                 return;
5414         }
5415         command->clipped = dpsoftrast.drawclipped;
             // every thread decrements refcount once when it is done with the command
5416         command->refcount = dpsoftrast.numthreads;
5417
5418         if (dpsoftrast.usethreads)
5419         {
5420                 int i;
5421                 DPSOFTRAST_Draw_SyncCommands();
                     // wake only idle threads whose row bands (miny1..maxy1 or miny2..maxy2)
                     // overlap the rows touched by this draw
5422                 for (i = 0; i < dpsoftrast.numthreads; i++)
5423                 {
5424                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5425                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5426                                 Thread_CondSignal(thread->drawcond);
5427                 }
5428         }
5429         else
5430         {
                     // no worker threads: interpret the queued commands on this thread now
5431                 DPSOFTRAST_Draw_FlushThreads();
5432         }
5433 }
5434
// Command record for a render target change; its payload is the new
// framebuffer width and height.
// NOTE(review): DEFCOMMAND is defined outside this section; it presumably
// declares DPSOFTRAST_Command_SetRenderTargets with opcode 23 - confirm.
5435 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
// Thread-side handler for SetRenderTargets: it only flags the framebuffer
// state of the thread for revalidation; the width and height stored in the
// command are not read here.
5436 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5437 {
5438         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5439 }
// Selects the framebuffer to render into: its dimensions, the depth buffer
// and up to four color buffers.
//
// If any of these differ from the current settings, all queued work is
// flushed before the global state is changed.  The viewport transform is then
// recomputed for the new framebuffer size and a SetRenderTargets command
// carrying width and height is queued (this happens even when nothing changed).
5440 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5441 {
5442         DPSOFTRAST_Command_SetRenderTargets *command;
             // flush first so that commands already queued finish against the old targets
5443         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5444                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5445                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5446                 DPSOFTRAST_Flush();
5447         dpsoftrast.fb_width = width;
5448         dpsoftrast.fb_height = height;
5449         dpsoftrast.fb_depthpixels = depthpixels;
5450         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5451         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5452         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5453         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5454         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5455         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5456         command->width = width;
5457         command->height = height;
5458 }
5459  
// Executes the queued commands for one thread, starting at
// thread->commandoffset and stopping once endoffset is reached.
//
// Offsets index into dpsoftrast.commandpool.commands; an offset that reaches
// DPSOFTRAST_DRAW_MAXCOMMANDPOOL wraps back to 0, and a Reset opcode also
// restarts at 0.  The final offset is stored back into thread->commandoffset.
// NOTE(review): the switch has no default case, so an opcode that is not
// listed below would leave commandoffset unchanged and the loop would not
// advance.
5460 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5461 {
5462         int commandoffset = thread->commandoffset;
5463         while (commandoffset != endoffset)
5464         {
5465                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5466                 switch (command->opcode)
5467                 {
                     // fixed-size commands: call the matching DPSOFTRAST_Interpret_ handler and
                     // advance by the aligned size of the command structure
5468 #define INTERPCOMMAND(name) \
5469                 case DPSOFTRAST_OPCODE_##name : \
5470                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5471                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5472                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5473                                 commandoffset = 0; \
5474                         break;
5475                 INTERPCOMMAND(Viewport)
5476                 INTERPCOMMAND(ClearColor)
5477                 INTERPCOMMAND(ClearDepth)
5478                 INTERPCOMMAND(ColorMask)
5479                 INTERPCOMMAND(DepthTest)
5480                 INTERPCOMMAND(ScissorTest)
5481                 INTERPCOMMAND(Scissor)
5482                 INTERPCOMMAND(BlendFunc)
5483                 INTERPCOMMAND(BlendSubtract)
5484                 INTERPCOMMAND(DepthMask)
5485                 INTERPCOMMAND(DepthFunc)
5486                 INTERPCOMMAND(DepthRange)
5487                 INTERPCOMMAND(PolygonOffset)
5488                 INTERPCOMMAND(CullFace)
5489                 INTERPCOMMAND(AlphaTest)
5490                 INTERPCOMMAND(AlphaFunc)
5491                 INTERPCOMMAND(SetTexture)
5492                 INTERPCOMMAND(SetShader)
5493                 INTERPCOMMAND(Uniform4f)
5494                 INTERPCOMMAND(UniformMatrix4f)
5495                 INTERPCOMMAND(Uniform1i)
5496                 INTERPCOMMAND(SetRenderTargets)
5497                 INTERPCOMMAND(ClipPlane)
5498
                     // draw commands have a variable size, so the size stored in the command is
                     // used; the progress of the thread is written back after every draw rather
                     // than only at the end of the loop
5499                 case DPSOFTRAST_OPCODE_Draw:
5500                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5501                         commandoffset += command->commandsize;
5502                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5503                                 commandoffset = 0;
5504                         thread->commandoffset = commandoffset;
5505                         break;
5506
                     // a Reset command sends the reader back to the start of the pool
5507                 case DPSOFTRAST_OPCODE_Reset:
5508                         commandoffset = 0;
5509                         break;
5510                 }
5511         }
5512         thread->commandoffset = commandoffset;
5513 }
5514
// Entry point of a worker thread; data is the DPSOFTRAST_State_Thread of the
// worker.
//
// The loop runs until thread->index becomes negative (DPSOFTRAST_Shutdown
// sets it to -1).  While commands are pending they are interpreted up to
// dpsoftrast.drawcommand; otherwise the thread sleeps on drawcond until it is
// signalled.  Always returns 0.
5515 static int DPSOFTRAST_Draw_Thread(void *data)
5516 {
5517         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5518         while(thread->index >= 0)
5519         {
5520                 if (thread->commandoffset != dpsoftrast.drawcommand)
5521                 {
5522                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5523                 }
5524                 else 
5525                 {
5526                         Thread_LockMutex(thread->drawmutex);
                             // check again under the mutex: new commands or a shutdown request may
                             // have arrived since the unlocked test above
5527                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5528                         {
                                     // a flush that is blocked waiting for this thread can continue now
5529                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
                                     // starving tells the producers that this thread needs a signal on
                                     // drawcond before it will look at the queue again
5530                                 thread->starving = true;
5531                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5532                                 thread->starving = false;
5533                         }
5534                         Thread_UnlockMutex(thread->drawmutex);
5535                 }
5536         }   
5537         return 0;
5538 }
5539
// Makes every thread state catch up with dpsoftrast.drawcommand and then
// marks the command pool as empty.
//
// With worker threads this wakes the idle workers that still have pending
// commands and blocks until each worker has reached drawcommand.  Without
// worker threads the pending commands are interpreted on the calling thread.
5540 static void DPSOFTRAST_Draw_FlushThreads(void)
5541 {
5542         DPSOFTRAST_State_Thread *thread;
5543         int i;
5544         DPSOFTRAST_Draw_SyncCommands();
5545         if (dpsoftrast.usethreads) 
5546         {
                     // first pass: wake every sleeping worker that still has commands to run
5547                 for (i = 0; i < dpsoftrast.numthreads; i++)
5548                 {
5549                         thread = &dpsoftrast.threads[i];
5550                         if (thread->commandoffset != dpsoftrast.drawcommand)
5551                         {
5552                                 Thread_LockMutex(thread->drawmutex);
5553                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5554                                         Thread_CondSignal(thread->drawcond);
5555                                 Thread_UnlockMutex(thread->drawmutex);
5556                         }
5557                 }
                     // second pass: wait for each worker in turn; a worker signals waitcond
                     // when it runs out of commands while the waiting flag is set
5558                 for (i = 0; i < dpsoftrast.numthreads; i++)
5559                 {
5560                         thread = &dpsoftrast.threads[i];
5561                         if (thread->commandoffset != dpsoftrast.drawcommand)
5562                         {
5563                                 Thread_LockMutex(thread->drawmutex);
5564                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5565                                 {
5566                                         thread->waiting = true;
5567                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5568                                         thread->waiting = false;
5569                                 }
5570                                 Thread_UnlockMutex(thread->drawmutex);
5571                         }
5572                 }
5573         }
5574         else
5575         {
                     // no worker threads: run the pending commands for each thread state here
5576                 for (i = 0; i < dpsoftrast.numthreads; i++)
5577                 {
5578                         thread = &dpsoftrast.threads[i];
5579                         if (thread->commandoffset != dpsoftrast.drawcommand)
5580                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5581                 }
5582         }
             // everything queued so far has been consumed
5583         dpsoftrast.commandpool.usedcommands = 0;
5584 }
5585
// Public entry point that executes all queued commands; it simply forwards to
// DPSOFTRAST_Draw_FlushThreads.
5586 void DPSOFTRAST_Flush(void)
5587 {
5588         DPSOFTRAST_Draw_FlushThreads();
5589 }
5590
// Same effect as DPSOFTRAST_Flush in this implementation: it calls
// DPSOFTRAST_Flush and does nothing else.
5591 void DPSOFTRAST_Finish(void)
5592 {
5593         DPSOFTRAST_Flush();
5594 }
5595
// Resets the global rasterizer state and creates the per-thread states.
//
// width/height:  framebuffer dimensions; also used for the initial viewport
//                and scissor rectangle
// numthreads:    requested number of worker threads; threading is used only if
//                this is > 0 and Thread_HasThreads() reports support, and the
//                count is then clamped to 1..64.  Otherwise a single thread
//                state is created and commands run on the calling thread.
// interlace:     clamped to 0..1 when threading is used, otherwise forced to 0
// colorpixels:   color buffer 0 (color buffers 1 to 3 start out as NULL)
// depthpixels:   depth buffer
//
// Always returns 0.
// NOTE(review): the MM_CALLOC result for the thread array is not checked.
5596 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5597 {
5598         int i;
             // byte order probe: the last byte of an int holding 1 is nonzero only
             // when the least significant byte is stored last
5599         union
5600         {
5601                 int i;
5602                 unsigned char b[4];
5603         }
5604         u;
5605         u.i = 1;
5606         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5607         dpsoftrast.bigendian = u.b[3];
5608         dpsoftrast.fb_width = width;
5609         dpsoftrast.fb_height = height;
5610         dpsoftrast.fb_depthpixels = depthpixels;
5611         dpsoftrast.fb_colorpixels[0] = colorpixels;
             // each of the remaining color targets is cleared explicitly (index 1 used
             // to be written three times, leaving 2 and 3 to the memset above)
5612         dpsoftrast.fb_colorpixels[1] = NULL;
5613         dpsoftrast.fb_colorpixels[2] = NULL;
5614         dpsoftrast.fb_colorpixels[3] = NULL;
5615         dpsoftrast.viewport[0] = 0;
5616         dpsoftrast.viewport[1] = 0;
5617         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5618         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5619         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
             // texture bookkeeping starts at index 1 while no texture storage exists
             // yet (texture_max is 0)
5620         dpsoftrast.texture_firstfree = 1;
5621         dpsoftrast.texture_end = 1;
5622         dpsoftrast.texture_max = 0;
5623         dpsoftrast.color[0] = 1;
5624         dpsoftrast.color[1] = 1;
5625         dpsoftrast.color[2] = 1;
5626         dpsoftrast.color[3] = 1;
5627         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5628         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5629         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5630         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5631         for (i = 0; i < dpsoftrast.numthreads; i++)
5632         {
5633                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
                     // default render state of every thread
5634                 thread->index = i;
5635                 thread->cullface = GL_BACK;
5636                 thread->colormask[0] = 1;
5637                 thread->colormask[1] = 1;
5638                 thread->colormask[2] = 1;
5639                 thread->colormask[3] = 1;
5640                 thread->blendfunc[0] = GL_ONE;
5641                 thread->blendfunc[1] = GL_ZERO;
5642                 thread->depthmask = true;
5643                 thread->depthtest = true;
5644                 thread->depthfunc = GL_LEQUAL;
5645                 thread->scissortest = false;
5646                 thread->alphatest = false;
5647                 thread->alphafunc = GL_GREATER;
5648                 thread->alphavalue = 0.5f;
5649                 thread->viewport[0] = 0;
5650                 thread->viewport[1] = 0;
5651                 thread->viewport[2] = dpsoftrast.fb_width;
5652                 thread->viewport[3] = dpsoftrast.fb_height;
5653                 thread->scissor[0] = 0;
5654                 thread->scissor[1] = 0;
5655                 thread->scissor[2] = dpsoftrast.fb_width;
5656                 thread->scissor[3] = dpsoftrast.fb_height;
5657                 thread->depthrange[0] = 0;
5658                 thread->depthrange[1] = 1;
5659                 thread->polygonoffset[0] = 0;
5660                 thread->polygonoffset[1] = 0;
5661                 thread->clipplane[0] = 0;
5662                 thread->clipplane[1] = 0;
5663                 thread->clipplane[2] = 0;
5664                 thread->clipplane[3] = 1;
5665
                     // empty work queues, nothing consumed from the command pool yet
5666                 thread->numspans = 0;
5667                 thread->numtriangles = 0;
5668                 thread->commandoffset = 0;
5669                 thread->waiting = false;
5670                 thread->starving = false;
5671
                     // mark all derived state as stale and validate it once up front
5672                 thread->validate = -1;
5673                 DPSOFTRAST_Validate(thread, -1);
5674
                     // the worker is started last, after its state and its
                     // synchronization objects are in place
5675                 if (dpsoftrast.usethreads)
5676                 {
5677                         thread->waitcond = Thread_CreateCond();
5678                         thread->drawcond = Thread_CreateCond();
5679                         thread->drawmutex = Thread_CreateMutex();
5680                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5681                 }
5682         }
5683         return 0;
5684 }
5685
// Stops the worker threads, releases the texture and thread storage and
// clears the global rasterizer state.
//
// Safe to call when no texture storage was ever allocated:
// DPSOFTRAST_Init leaves dpsoftrast.texture NULL while setting texture_end to
// 1, so the texture loop must not run in that case.
5686 void DPSOFTRAST_Shutdown(void)
5687 {
5688         int i;
5689         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5690         {
5691                 DPSOFTRAST_State_Thread *thread;
5692                 for (i = 0; i < dpsoftrast.numthreads; i++)
5693                 {
5694                         thread = &dpsoftrast.threads[i];
                             // a negative index is the exit request checked by
                             // DPSOFTRAST_Draw_Thread; it is set under the mutex and followed by
                             // a signal so that a sleeping worker notices it
5695                         Thread_LockMutex(thread->drawmutex);
5696                         thread->index = -1;
5697                         Thread_CondSignal(thread->drawcond);
5698                         Thread_UnlockMutex(thread->drawmutex);
                             // the synchronization objects are destroyed only after the worker
                             // has been waited for
5699                         Thread_WaitThread(thread->thread, 0);
5700                         Thread_DestroyCond(thread->waitcond);
5701                         Thread_DestroyCond(thread->drawcond);
5702                         Thread_DestroyMutex(thread->drawmutex);
5703                 }
5704         }
             // the loop is skipped while the texture array is still NULL; previously
             // it read dpsoftrast.texture[0].bytes through a null pointer in that case
5705         for (i = 0;dpsoftrast.texture && i < dpsoftrast.texture_end;i++)
5706                 if (dpsoftrast.texture[i].bytes)
5707                         MM_FREE(dpsoftrast.texture[i].bytes);
5708         if (dpsoftrast.texture)
5709                 free(dpsoftrast.texture);
5710         if (dpsoftrast.threads)
5711                 MM_FREE(dpsoftrast.threads);
5712         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5713 }
5714