]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
break the net connection AFTER calling ClientDisconnect; this fixes issues with clien...
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
// byte alignments used by the ALIGN() and ATOMIC() declaration wrappers
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32

// Per-compiler alignment and atomic primitives, only selected when the build
// enables SSE.  Every branch provides the same set of names:
//   ALIGN(var) / ATOMIC(var)      declare var with 16 / 32 byte alignment
//   MEMORY_BARRIER                store fence
//   ATOMIC_COUNTER                integer type for shared counters
//   ATOMIC_INCREMENT / DECREMENT  atomic +1 / -1
//   ATOMIC_ADD                    atomic add, result discarded
#if defined(SSE_POSSIBLE)
#  if defined(__APPLE__)
#    include <libkern/OSAtomic.h>
#    define ALIGN(var) var __attribute__((__aligned__(16)))
#    define ATOMIC(var) var __attribute__((__aligned__(32)))
#    define MEMORY_BARRIER (_mm_sfence())
#    define ATOMIC_COUNTER volatile int32_t
#    define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
#    define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
#    define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
#  elif defined(__GNUC__)
#    define ALIGN(var) var __attribute__((__aligned__(16)))
#    define ATOMIC(var) var __attribute__((__aligned__(32)))
     // alternative that is not in use: (__sync_synchronize())
#    define MEMORY_BARRIER (_mm_sfence())
#    define ATOMIC_COUNTER volatile int
#    define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
#    define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
#    define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
#  elif defined(_MSC_VER)
#    define ALIGN(var) __declspec(align(16)) var
#    define ATOMIC(var) __declspec(align(32)) var
     // alternative that is not in use: (MemoryBarrier())
#    define MEMORY_BARRIER (_mm_sfence())
#    define ATOMIC_COUNTER volatile LONG
#    define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
#    define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
#    define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
#  endif
#endif

// Plain, non-atomic fallbacks for every name that was not provided above
// (no SSE, or a compiler none of the branches recognised).
#if !defined(ALIGN)
#  define ALIGN(var) var
#endif
#if !defined(ATOMIC)
#  define ATOMIC(var) var
#endif
#if !defined(MEMORY_BARRIER)
#  define MEMORY_BARRIER ((void)0)
#endif
#if !defined(ATOMIC_COUNTER)
#  define ATOMIC_COUNTER int
#endif
#if !defined(ATOMIC_INCREMENT)
#  define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#if !defined(ATOMIC_DECREMENT)
#  define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#if !defined(ATOMIC_ADD)
#  define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
72
// Memory helpers for pixel and vertex storage.
//   MM_MALLOC(size)        allocate size bytes
//   MM_CALLOC(nmemb, size) allocate nmemb*size zeroed bytes, NULL on failure
//   MM_FREE(ptr)           release memory obtained from the two above
// SSE builds return memory aligned to ATOMIC_SIZE bytes via _mm_malloc; other
// builds fall back to the C library allocator.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// Substitute a builtin-based _mm_cvtss_f32 on GCC releases older than 4.6
// (presumably because those releases lack the intrinsic - TODO confirm).
// The major version has to be tested together with the minor one, otherwise
// releases such as 5.0 would match as well.
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
        #define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif

#define MM_MALLOC(size) _mm_malloc((size), ATOMIC_SIZE)

// aligned counterpart of calloc(): returns NULL when the allocation fails or
// when nmemb*size does not fit in a size_t
static void *MM_CALLOC(size_t nmemb, size_t size)
{
        size_t total;
        void *ptr;
        // reject requests whose byte count would wrap around
        if (size != 0 && nmemb > (size_t)-1 / size)
                return NULL;
        total = nmemb*size;
        ptr = _mm_malloc(total, ATOMIC_SIZE);
        if (ptr != NULL)
                memset(ptr, 0, total);
        return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
95
// Index of each per-vertex attribute array; DPSOFTRAST_ARRAY_TOTAL is the
// number of arrays and is used to size the per-attribute tables in this file.
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION  = 0,
        DPSOFTRAST_ARRAY_COLOR     = 1,
        DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
        DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
        DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
        DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
        DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
        DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
        DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
        DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
        DPSOFTRAST_ARRAY_TOTAL     = 10
} DPSOFTRAST_ARRAY;
111
// One slot of the texture table (dpsoftrast.texture).
// A slot whose bytes pointer is NULL is treated as unused.
typedef struct DPSOFTRAST_Texture_s
{
        int flags; // DPSOFTRAST_TEXTURE_* flags passed at creation
        int width;
        int height;
        int depth;
        int sides; // 6 for cubemaps, 1 otherwise
        DPSOFTRAST_TEXTURE_FILTER filter;
        int mipmaps; // number of valid rows in mipmap[]
        int size; // total bytes of all mip levels
        ATOMIC_COUNTER binds; // when nonzero, DPSOFTRAST_Flush is called before the texture is modified or freed
        unsigned char *bytes; // storage for all mip levels, 4 bytes per pixel
        // per mip level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
        int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
}
DPSOFTRAST_Texture;
127
// commands use the same alignment as ALIGN() (ALIGN_SIZE bytes)
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Generic command header; every struct generated by DEFCOMMAND starts with
// these same two fields.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode; // one of the DPSOFTRAST_OPCODE_* values
        unsigned short commandsize;
}
DPSOFTRAST_Command);
137
// opcode 0 has no command struct of its own
enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares, for one command type:
//   - the constant DPSOFTRAST_OPCODE_<name> with value opcodeval
//   - the aligned struct DPSOFTRAST_Command_<name>, which begins with the
//     same opcode/commandsize header as DPSOFTRAST_Command, followed by fields
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );
148
// size in bytes of the command storage below
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the largest single command in bytes - confirm against the allocator
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// Fixed-size byte pool that holds queued commands.
// NOTE(review): freecommand/usedcommands look like a write offset and a fill
// level; the code that maintains them is outside this view - confirm there.
typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
{
        int freecommand;
        int usedcommands;
        ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
159
// Per-triangle interpolation data consumed by the DPSOFTRAST_CALCATTRIB macros.
typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
{
        unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        float w[3];
        // per attribute array, three 4-float vectors as read by DPSOFTRAST_CALCATTRIB:
        // [0] step per unit of span x, [1] step per unit of span y, [2] base value
        ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
167
// Evaluates attribute array `arrayindex` of `triangle` at the position of
// `span` (SSE version, data and slope are __m128 lvalues):
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// Every macro argument is parenthesized so that expressions such as
// `spans + i` can be passed safely.  The expansion is a braced block, and
// triangle and span are evaluated more than once.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        (slope) = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        (data) = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), (slope)), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of DPSOFTRAST_CALCATTRIB: data and slope are float[4]
// (or pointers to four floats); same formula, one component at a time.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        (slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
        (slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
        (slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
        (slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
        (data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
        (data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
        (data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
        (data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
184                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels produced for a triangle; the x and y fields are
// the position the DPSOFTRAST_CALCATTRIB macros evaluate attributes at.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle; // triangle this span was generated by
        int x; // framebuffer x coord
        int y; // framebuffer y coord
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
        int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
        int depthslope; // depthbuffer value pixel delta
}
DPSOFTRAST_State_Span);
199
// capacities of the per-thread span/triangle arrays and of the pixel mask
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// bits of DPSOFTRAST_State_Thread::validate; a set bit means the matching
// derived state has to be recomputed by DPSOFTRAST_Validate
#define DPSOFTRAST_VALIDATE_FB (1 << 0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1 << 1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1 << 2)
// everything that must be up to date before drawing
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
208
// Blend modes the rasterizer distinguishes; DPSOFTRAST_RecalcBlendFunc picks
// one from the blend factors and stores it in fb_blendmode.
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE      = 0,
        DPSOFTRAST_BLENDMODE_ALPHA       = 1,
        DPSOFTRAST_BLENDMODE_ADDALPHA    = 2,
        DPSOFTRAST_BLENDMODE_ADD         = 3,
        DPSOFTRAST_BLENDMODE_INVMOD      = 4,
        DPSOFTRAST_BLENDMODE_MUL         = 5,
        DPSOFTRAST_BLENDMODE_MUL2        = 6,
        DPSOFTRAST_BLENDMODE_SUBALPHA    = 7,
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
        DPSOFTRAST_BLENDMODE_INVADD      = 9,
        DPSOFTRAST_BLENDMODE_TOTAL       = 10
} DPSOFTRAST_BLENDMODE;
224
// State owned by one rasterizer thread: a copy of the render state, the
// values derived from it by the DPSOFTRAST_Recalc* functions, the band of
// framebuffer rows the thread is responsible for, and its span/triangle
// scratch buffers.
typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
{
        void *thread;
        int index; // position of this thread among dpsoftrast.numthreads, used to compute its row bands
        
        // render state
        // NOTE(review): presumably set while executing queued commands; the setters are outside this view
        int cullface;
        int colormask[4];
        int blendfunc[2]; // source factor, destination factor
        int blendsubtract;
        int depthmask;
        int depthtest;
        int depthfunc;
        int scissortest;
        int alphatest;
        int alphafunc;
        float alphavalue;
        int viewport[4];
        int scissor[4]; // x, y, width, height as consumed by DPSOFTRAST_RecalcFB
        float depthrange[2];
        float polygonoffset[2];
        float clipplane[4];
        ALIGN(float fb_clipplane[4]); // clipplane rescaled by DPSOFTRAST_RecalcClipPlane

        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        // pointers into dpsoftrast.texture; rebased by DPSOFTRAST_Texture_Grow
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
        
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // DPSOFTRAST_VALIDATE_ flags
        int validate;

        // derived values (DPSOFTRAST_VALIDATE_FB)
        int fb_colormask;
        int fb_scissor[4]; // x, y, width, height after clamping to the framebuffer
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
        int fb_depthfunc;

        // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
        int fb_blendmode;

        // band boundaries
        // (computed by DPSOFTRAST_RecalcThread; the second pair equals the first unless interlacing is on)
        int miny1;
        int maxy1;
        int miny2;
        int maxy2;

        ATOMIC(volatile int commandoffset);

        volatile bool waiting;
        volatile bool starving;
        void *waitcond;
        void *drawcond;
        void *drawmutex;

        int numspans;
        int numtriangles;
        DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
        DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
        unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
}
DPSOFTRAST_State_Thread);
293
// Global rasterizer state: framebuffer description, the current vertex
// array bindings, the texture table, the worker threads and the command pool.
typedef ATOMIC(struct DPSOFTRAST_State_s
{
        int fb_width;
        int fb_height;
        unsigned int *fb_depthpixels;
        unsigned int *fb_colorpixels[4];

        int viewport[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        float color[4];
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // vertex array bindings
        const float *pointer_vertex3f;
        const float *pointer_color4f;
        const unsigned char *pointer_color4ub;
        const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int stride_vertex;
        int stride_color;
        int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        // pointers into the texture table below; rebased by DPSOFTRAST_Texture_Grow
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];

        int firstvertex;
        int numvertices;
        float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
        float *screencoord4f;
        int drawstarty;
        int drawendy;
        int drawclipped;
        
        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        // texture table bookkeeping
        int texture_max; // number of slots allocated in texture[]
        int texture_end; // one past the highest slot in use; lookups reject indices at or beyond it
        int texture_firstfree; // index where DPSOFTRAST_Texture_New starts searching for an unused slot
        DPSOFTRAST_Texture *texture;

        int bigendian;

        // error reporting
        const char *errorstring;

        bool usethreads;
        int interlace; // when nonzero each thread gets two row bands instead of one
        int numthreads;
        DPSOFTRAST_State_Thread *threads;

        ATOMIC(volatile int drawcommand);

        DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

// the single rasterizer instance used by every function in this file
DPSOFTRAST_State dpsoftrast;
353
// scale applied to (1 - depth) when converting to the integer depth buffer
// format (2^30)
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (expected range 0..1) into one BGRA8 pixel:
// alpha in bits 24-31, red in 16-23, green in 8-15, blue in 0-7, each rounded
// to the nearest integer.  Arguments are parenthesized so that expressions can
// be passed, and the packing is done on unsigned values because shifting a
// signed int into the sign bit (alpha >= 0.5) is undefined behaviour; the
// result is converted back to int as before.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) \
        ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
               ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | \
                (unsigned int)(int)((b) * 255.0f + 0.5f) | \
               ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth to the integer depth buffer format; larger depth
// values produce smaller integers (1.0 maps to 0).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
358
// forward declarations; both functions are defined outside the part of the file shown here
static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
361
362 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
363 {
364         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
365         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
366         fb_viewportcenter[3] = 0.5f;
367         fb_viewportcenter[0] = 0.0f;
368         fb_viewportscale[1] = 0.5f * viewport[2];
369         fb_viewportscale[2] = -0.5f * viewport[3];
370         fb_viewportscale[3] = 0.5f;
371         fb_viewportscale[0] = 1.0f;
372 }
373
374 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
375 {
376         if (dpsoftrast.interlace)
377         {
378                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
379                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
380                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
381                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
382         }
383         else
384         {
385                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
386                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
387         }
388 }
389
390 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
391 {
392         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
393         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
394         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
395         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
396         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
397 }
398
399 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
400 {
401         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
402         // and viewport projection values
403         int x1, x2;
404         int y1, y2;
405         x1 = thread->scissor[0];
406         x2 = thread->scissor[0] + thread->scissor[2];
407         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
408         y2 = dpsoftrast.fb_height - thread->scissor[1];
409         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
410         if (x1 < 0) x1 = 0;
411         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
412         if (y1 < 0) y1 = 0;
413         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
414         thread->fb_scissor[0] = x1;
415         thread->fb_scissor[1] = y1;
416         thread->fb_scissor[2] = x2 - x1;
417         thread->fb_scissor[3] = y2 - y1;
418
419         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
420         DPSOFTRAST_RecalcClipPlane(thread);
421         DPSOFTRAST_RecalcThread(thread);
422 }
423
424 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
425 {
426         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
427 }
428
429 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
430 {
431         if (thread->blendsubtract)
432         {
433                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
434                 {
435                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
436                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
437                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
438                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
439                 }
440         }
441         else
442         {       
443                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
444                 {
445                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
446                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
447                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
448                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
449                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
450                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
451                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
452                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
453                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
454                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
455                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
456                 }
457         }
458 }
459
// Inline fast path for DPSOFTRAST_Validate: only calls it when at least one
// of the flags in f is still pending on the thread.  Always evaluates to 0.
// The thread argument is parenthesized so pointer expressions can be passed;
// it may be evaluated twice, so it must not have side effects.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
461
462 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
463 {
464         mask &= thread->validate;
465         if (!mask)
466                 return;
467         if (mask & DPSOFTRAST_VALIDATE_FB)
468         {
469                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
470                 DPSOFTRAST_RecalcFB(thread);
471         }
472         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
473         {
474                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
475                 DPSOFTRAST_RecalcDepthFunc(thread);
476         }
477         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
478         {
479                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
480                 DPSOFTRAST_RecalcBlendFunc(thread);
481         }
482 }
483
484 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
485 {
486         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
487                 return &dpsoftrast.texture[index];
488         return NULL;
489 }
490
491 static void DPSOFTRAST_Texture_Grow(void)
492 {
493         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
494         DPSOFTRAST_State_Thread *thread;
495         int i;
496         int j;
497         DPSOFTRAST_Flush();
498         // expand texture array as needed
499         if (dpsoftrast.texture_max < 1024)
500                 dpsoftrast.texture_max = 1024;
501         else
502                 dpsoftrast.texture_max *= 2;
503         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
504         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
505                 if (dpsoftrast.texbound[i])
506                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
507         for (j = 0; j < dpsoftrast.numthreads; j++)
508         {
509                 thread = &dpsoftrast.threads[j];
510                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
511                         if (thread->texbound[i])
512                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
513         }
514 }
515
516 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
517 {
518         int w;
519         int h;
520         int d;
521         int size;
522         int s;
523         int texnum;
524         int mipmaps;
525         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
526         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
527         DPSOFTRAST_Texture *texture;
528         if (width*height*depth < 1)
529         {
530                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
531                 return 0;
532         }
533         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
534         {
535                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
536                 return 0;
537         }
538         switch(texformat)
539         {
540         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
541         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
542         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
543                 break;
544         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
545                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
546                 {
547                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
548                         return 0;
549                 }
550                 if (depth != 1)
551                 {
552                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
553                         return 0;
554                 }
555                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
556                 {
557                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
558                         return 0;
559                 }
560                 break;
561         }
562         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
563         {
564                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
565                 return 0;
566         }
567         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
568         {
569                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
570                 return 0;
571         }
572         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
573         {
574                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
575                 return 0;
576         }
577         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
578         {
579                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
580                 return 0;
581         }
582         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
583         {
584                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
585                 return 0;
586         }
587         // find first empty slot in texture array
588         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
589                 if (!dpsoftrast.texture[texnum].bytes)
590                         break;
591         dpsoftrast.texture_firstfree = texnum + 1;
592         if (dpsoftrast.texture_max <= texnum)
593                 DPSOFTRAST_Texture_Grow();
594         if (dpsoftrast.texture_end <= texnum)
595                 dpsoftrast.texture_end = texnum + 1;
596         texture = &dpsoftrast.texture[texnum];
597         memset(texture, 0, sizeof(*texture));
598         texture->flags = flags;
599         texture->width = width;
600         texture->height = height;
601         texture->depth = depth;
602         texture->sides = sides;
603         texture->binds = 0;
604         w = width;
605         h = height;
606         d = depth;
607         size = 0;
608         mipmaps = 0;
609         w = width;
610         h = height;
611         d = depth;
612         for (;;)
613         {
614                 s = w * h * d * sides * 4;
615                 texture->mipmap[mipmaps][0] = size;
616                 texture->mipmap[mipmaps][1] = s;
617                 texture->mipmap[mipmaps][2] = w;
618                 texture->mipmap[mipmaps][3] = h;
619                 texture->mipmap[mipmaps][4] = d;
620                 size += s;
621                 mipmaps++;
622                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
623                         break;
624                 if (w > 1) w >>= 1;
625                 if (h > 1) h >>= 1;
626                 if (d > 1) d >>= 1;
627         }
628         texture->mipmaps = mipmaps;
629         texture->size = size;
630
631         // allocate the pixels now
632         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
633
634         return texnum;
635 }
636 void DPSOFTRAST_Texture_Free(int index)
637 {
638         DPSOFTRAST_Texture *texture;
639         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
640         if (texture->binds)
641                 DPSOFTRAST_Flush();
642         if (texture->bytes)
643                 MM_FREE(texture->bytes);
644         texture->bytes = NULL;
645         memset(texture, 0, sizeof(*texture));
646         // adjust the free range and used range
647         if (dpsoftrast.texture_firstfree > index)
648                 dpsoftrast.texture_firstfree = index;
649         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
650                 dpsoftrast.texture_end--;
651 }
652 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
653 {
654         int i, x, y, z, w, layer0, layer1, row0, row1;
655         unsigned char *o, *i0, *i1, *i2, *i3;
656         DPSOFTRAST_Texture *texture;
657         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
658         if (texture->mipmaps <= 1)
659                 return;
660         for (i = 1;i < texture->mipmaps;i++)
661         {
662                 for (z = 0;z < texture->mipmap[i][4];z++)
663                 {
664                         layer0 = z*2;
665                         layer1 = z*2+1;
666                         if (layer1 >= texture->mipmap[i-1][4])
667                                 layer1 = texture->mipmap[i-1][4]-1;
668                         for (y = 0;y < texture->mipmap[i][3];y++)
669                         {
670                                 row0 = y*2;
671                                 row1 = y*2+1;
672                                 if (row1 >= texture->mipmap[i-1][3])
673                                         row1 = texture->mipmap[i-1][3]-1;
674                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
675                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
676                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
677                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
678                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
679                                 w = texture->mipmap[i][2];
680                                 if (layer1 > layer0)
681                                 {
682                                         if (texture->mipmap[i-1][2] > 1)
683                                         {
684                                                 // average 3D texture
685                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
686                                                 {
687                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
688                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
689                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
690                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
691                                                 }
692                                         }
693                                         else
694                                         {
695                                                 // average 3D mipmap with parent width == 1
696                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
697                                                 {
698                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
699                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
700                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
701                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
702                                                 }
703                                         }
704                                 }
705                                 else
706                                 {
707                                         if (texture->mipmap[i-1][2] > 1)
708                                         {
709                                                 // average 2D texture (common case)
710                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
711                                                 {
712                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
713                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
714                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
715                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
716                                                 }
717                                         }
718                                         else
719                                         {
720                                                 // 2D texture with parent width == 1
721                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
722                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
723                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
724                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
725                                         }
726                                 }
727                         }
728                 }
729         }
730 }
731 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
732 {
733         DPSOFTRAST_Texture *texture;
734         unsigned char *dst;
735         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
736         if (texture->binds)
737                 DPSOFTRAST_Flush();
738         if (pixels)
739         {
740                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
741                 while (blockheight > 0)
742                 {
743                         memcpy(dst, pixels, blockwidth * 4);
744                         pixels += blockwidth * 4;
745                         dst += texture->mipmap[0][2] * 4;
746                         blockheight--;
747                 }
748         }
749         DPSOFTRAST_Texture_CalculateMipmaps(index);
750 }
751 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
752 {
753         DPSOFTRAST_Texture *texture;
754         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
755         if (texture->binds)
756                 DPSOFTRAST_Flush();
757         if (pixels)
758                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
759         DPSOFTRAST_Texture_CalculateMipmaps(index);
760 }
761 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
762 {
763         DPSOFTRAST_Texture *texture;
764         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
765         return texture->mipmap[mip][2];
766 }
767 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
768 {
769         DPSOFTRAST_Texture *texture;
770         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
771         return texture->mipmap[mip][3];
772 }
773 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
774 {
775         DPSOFTRAST_Texture *texture;
776         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
777         return texture->mipmap[mip][4];
778 }
779 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
780 {
781         DPSOFTRAST_Texture *texture;
782         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
783         if (texture->binds)
784                 DPSOFTRAST_Flush();
785         return texture->bytes + texture->mipmap[mip][0];
786 }
787 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
788 {
789         DPSOFTRAST_Texture *texture;
790         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
791         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
792         {
793                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
794                 return;
795         }
796         if (texture->binds)
797                 DPSOFTRAST_Flush();
798         texture->filter = filter;
799 }
800
801 static void DPSOFTRAST_Draw_FlushThreads(void);
802
803 static void DPSOFTRAST_Draw_SyncCommands(void)
804 {
805         if(dpsoftrast.usethreads) MEMORY_BARRIER;
806         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
807 }
808
809 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
810 {
811         DPSOFTRAST_State_Thread *thread;
812         int i;
813         int freecommand = dpsoftrast.commandpool.freecommand;
814         int usedcommands = dpsoftrast.commandpool.usedcommands;
815         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
816                 return;
817         DPSOFTRAST_Draw_SyncCommands();
818         for(;;)
819         {
820                 int waitindex = -1;
821                 int commandoffset;
822                 usedcommands = 0;
823                 for (i = 0; i < dpsoftrast.numthreads; i++)
824                 {
825                         thread = &dpsoftrast.threads[i]; 
826                         commandoffset = freecommand - thread->commandoffset;
827                         if (commandoffset < 0)
828                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
829                         if (commandoffset > usedcommands)
830                         {
831                                 waitindex = i;
832                                 usedcommands = commandoffset;
833                         }
834                 }
835                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
836                         break;
837                 thread = &dpsoftrast.threads[waitindex];
838                 Thread_LockMutex(thread->drawmutex);
839                 if (thread->commandoffset != dpsoftrast.drawcommand)
840                 {
841                         thread->waiting = true;
842                         if (thread->starving) Thread_CondSignal(thread->drawcond);
843                         Thread_CondWait(thread->waitcond, thread->drawmutex);
844                         thread->waiting = false;
845                 }
846                 Thread_UnlockMutex(thread->drawmutex);
847         }
848         dpsoftrast.commandpool.usedcommands = usedcommands;
849 }
850
851 #define DPSOFTRAST_ALIGNCOMMAND(size) \
852         ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
853 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
854         ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
855
856 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
857 {
858         DPSOFTRAST_Command *command;
859         int freecommand = dpsoftrast.commandpool.freecommand;
860         int usedcommands = dpsoftrast.commandpool.usedcommands;
861         int extra = sizeof(DPSOFTRAST_Command);
862         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
863                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
864         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
865         {
866                 if (dpsoftrast.usethreads)
867                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
868                 else
869                         DPSOFTRAST_Draw_FlushThreads();
870                 freecommand = dpsoftrast.commandpool.freecommand;
871                 usedcommands = dpsoftrast.commandpool.usedcommands;
872         }
873         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
874         {
875                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
876                 command->opcode = DPSOFTRAST_OPCODE_Reset;
877                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
878                 freecommand = 0;
879         }
880         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
881         command->opcode = opcode;
882         command->commandsize = size;
883         freecommand += size;
884         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
885                 freecommand = 0;
886         dpsoftrast.commandpool.freecommand = freecommand;
887         dpsoftrast.commandpool.usedcommands = usedcommands + size;
888         return command;
889 }
890
891 static void DPSOFTRAST_UndoCommand(int size)
892 {
893         int freecommand = dpsoftrast.commandpool.freecommand;
894         int usedcommands = dpsoftrast.commandpool.usedcommands;
895         freecommand -= size;
896         if (freecommand < 0)
897                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
898         usedcommands -= size;
899         dpsoftrast.commandpool.freecommand = freecommand;
900         dpsoftrast.commandpool.usedcommands = usedcommands;
901 }
902                 
903 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
904 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
905 {
906         thread->viewport[0] = command->x;
907         thread->viewport[1] = command->y;
908         thread->viewport[2] = command->width;
909         thread->viewport[3] = command->height;
910         thread->validate |= DPSOFTRAST_VALIDATE_FB;
911 }
912 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
913 {
914         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
915         command->x = x;
916         command->y = y;
917         command->width = width;
918         command->height = height;
919
920         dpsoftrast.viewport[0] = x;
921         dpsoftrast.viewport[1] = y;
922         dpsoftrast.viewport[2] = width;
923         dpsoftrast.viewport[3] = height;
924         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
925 }
926
927 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
928 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
929 {
930         int i, x1, y1, x2, y2, w, h, x, y;
931         int miny1, maxy1, miny2, maxy2;
932         int bandy;
933         unsigned int *p;
934         unsigned int c;
935         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
936         miny1 = thread->miny1;
937         maxy1 = thread->maxy1;
938         miny2 = thread->miny2;
939         maxy2 = thread->maxy2;
940         x1 = thread->fb_scissor[0];
941         y1 = thread->fb_scissor[1];
942         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
943         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
944         if (y1 < miny1) y1 = miny1;
945         if (y2 > maxy2) y2 = maxy2;
946         w = x2 - x1;
947         h = y2 - y1;
948         if (w < 1 || h < 1)
949                 return;
950         // FIXME: honor fb_colormask?
951         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
952         for (i = 0;i < 4;i++)
953         {
954                 if (!dpsoftrast.fb_colorpixels[i])
955                         continue;
956                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
957                 for (;y < bandy;y++)
958                 {
959                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
960                         for (x = x1;x < x2;x++)
961                                 p[x] = c;
962                 }
963         }
964 }
965 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
966 {
967         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
968         command->r = r;
969         command->g = g;
970         command->b = b;
971         command->a = a;
972 }
973
974 DEFCOMMAND(3, ClearDepth, float depth;)
975 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
976 {
977         int x1, y1, x2, y2, w, h, x, y;
978         int miny1, maxy1, miny2, maxy2;
979         int bandy;
980         unsigned int *p;
981         unsigned int c;
982         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
983         miny1 = thread->miny1;
984         maxy1 = thread->maxy1;
985         miny2 = thread->miny2;
986         maxy2 = thread->maxy2;
987         x1 = thread->fb_scissor[0];
988         y1 = thread->fb_scissor[1];
989         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
990         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
991         if (y1 < miny1) y1 = miny1;
992         if (y2 > maxy2) y2 = maxy2;
993         w = x2 - x1;
994         h = y2 - y1;
995         if (w < 1 || h < 1)
996                 return;
997         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
998         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
999         for (;y < bandy;y++)
1000         {
1001                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1002                 for (x = x1;x < x2;x++)
1003                         p[x] = c;
1004         }
1005 }
1006 void DPSOFTRAST_ClearDepth(float d)
1007 {
1008         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1009         command->depth = d;
1010 }
1011
1012 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1013 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1014 {
1015         thread->colormask[0] = command->r != 0;
1016         thread->colormask[1] = command->g != 0;
1017         thread->colormask[2] = command->b != 0;
1018         thread->colormask[3] = command->a != 0;
1019         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1020 }
1021 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1022 {
1023         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1024         command->r = r;
1025         command->g = g;
1026         command->b = b;
1027         command->a = a;
1028 }
1029
1030 DEFCOMMAND(5, DepthTest, int enable;)
1031 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1032 {
1033         thread->depthtest = command->enable;
1034         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1035 }
1036 void DPSOFTRAST_DepthTest(int enable)
1037 {
1038         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1039         command->enable = enable;
1040 }
1041
1042 DEFCOMMAND(6, ScissorTest, int enable;)
1043 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1044 {
1045         thread->scissortest = command->enable;
1046         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1047 }
1048 void DPSOFTRAST_ScissorTest(int enable)
1049 {
1050         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1051         command->enable = enable;
1052 }
1053
1054 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1055 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1056 {
1057         thread->scissor[0] = command->x;
1058         thread->scissor[1] = command->y;
1059         thread->scissor[2] = command->width;
1060         thread->scissor[3] = command->height;
1061         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1062 }
1063 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1064 {
1065         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1066         command->x = x;
1067         command->y = y;
1068         command->width = width;
1069         command->height = height;
1070 }
1071
1072 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1073 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1074 {
1075         thread->blendfunc[0] = command->sfactor;
1076         thread->blendfunc[1] = command->dfactor;
1077         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1078 }
1079 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1080 {
1081         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1082         command->sfactor = sfactor;
1083         command->dfactor = dfactor;
1084 }
1085
1086 DEFCOMMAND(9, BlendSubtract, int enable;)
1087 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1088 {
1089         thread->blendsubtract = command->enable;
1090         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1091 }
1092 void DPSOFTRAST_BlendSubtract(int enable)
1093 {
1094         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1095         command->enable = enable;
1096 }
1097
1098 DEFCOMMAND(10, DepthMask, int enable;)
1099 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1100 {
1101         thread->depthmask = command->enable;
1102 }
1103 void DPSOFTRAST_DepthMask(int enable)
1104 {
1105         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1106         command->enable = enable;
1107 }
1108
1109 DEFCOMMAND(11, DepthFunc, int func;)
1110 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1111 {
1112         thread->depthfunc = command->func;
1113 }
1114 void DPSOFTRAST_DepthFunc(int func)
1115 {
1116         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1117         command->func = func;
1118 }
1119
1120 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1121 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1122 {
1123         thread->depthrange[0] = command->nearval;
1124         thread->depthrange[1] = command->farval;
1125 }
1126 void DPSOFTRAST_DepthRange(float nearval, float farval)
1127 {
1128         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1129         command->nearval = nearval;
1130         command->farval = farval;
1131 }
1132
1133 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1134 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1135 {
1136         thread->polygonoffset[0] = command->alongnormal;
1137         thread->polygonoffset[1] = command->intoview;
1138 }
1139 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1140 {
1141         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1142         command->alongnormal = alongnormal;
1143         command->intoview = intoview;
1144 }
1145
1146 DEFCOMMAND(14, CullFace, int mode;)
1147 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1148 {
1149         thread->cullface = command->mode;
1150 }
1151 void DPSOFTRAST_CullFace(int mode)
1152 {
1153         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1154         command->mode = mode;
1155 }
1156
1157 DEFCOMMAND(15, AlphaTest, int enable;)
1158 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1159 {
1160         thread->alphatest = command->enable;
1161 }
1162 void DPSOFTRAST_AlphaTest(int enable)
1163 {
1164         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1165         command->enable = enable;
1166 }
1167
1168 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1169 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1170 {
1171         thread->alphafunc = command->func;
1172         thread->alphavalue = command->ref;
1173 }
1174 void DPSOFTRAST_AlphaFunc(int func, float ref)
1175 {
1176         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1177         command->func = func;
1178         command->ref = ref;
1179 }
1180
1181 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1182 {
1183         dpsoftrast.color[0] = r;
1184         dpsoftrast.color[1] = g;
1185         dpsoftrast.color[2] = b;
1186         dpsoftrast.color[3] = a;
1187 }
1188
1189 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1190 {
1191         int outstride = blockwidth * 4;
1192         int instride = dpsoftrast.fb_width * 4;
1193         int bx1 = blockx;
1194         int by1 = blocky;
1195         int bx2 = blockx + blockwidth;
1196         int by2 = blocky + blockheight;
1197         int bw;
1198         int x;
1199         int y;
1200         unsigned char *inpixels;
1201         unsigned char *b;
1202         unsigned char *o;
1203         DPSOFTRAST_Flush();
1204         if (bx1 < 0) bx1 = 0;
1205         if (by1 < 0) by1 = 0;
1206         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1207         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1208         bw = bx2 - bx1;
1209         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1210         if (dpsoftrast.bigendian)
1211         {
1212                 for (y = by1;y < by2;y++)
1213                 {
1214                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1215                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1216                         for (x = bx1;x < bx2;x++)
1217                         {
1218                                 o[0] = b[3];
1219                                 o[1] = b[2];
1220                                 o[2] = b[1];
1221                                 o[3] = b[0];
1222                                 o += 4;
1223                                 b += 4;
1224                         }
1225                 }
1226         }
1227         else
1228         {
1229                 for (y = by1;y < by2;y++)
1230                 {
1231                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1232                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1233                         memcpy(o, b, bw*4);
1234                 }
1235         }
1236
1237 }
1238 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1239 {
1240         int tx1 = tx;
1241         int ty1 = ty;
1242         int tx2 = tx + width;
1243         int ty2 = ty + height;
1244         int sx1 = sx;
1245         int sy1 = sy;
1246         int sx2 = sx + width;
1247         int sy2 = sy + height;
1248         int swidth;
1249         int sheight;
1250         int twidth;
1251         int theight;
1252         int sw;
1253         int sh;
1254         int tw;
1255         int th;
1256         int y;
1257         unsigned int *spixels;
1258         unsigned int *tpixels;
1259         DPSOFTRAST_Texture *texture;
1260         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1261         if (mip < 0 || mip >= texture->mipmaps) return;
1262         DPSOFTRAST_Flush();
1263         spixels = dpsoftrast.fb_colorpixels[0];
1264         swidth = dpsoftrast.fb_width;
1265         sheight = dpsoftrast.fb_height;
1266         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1267         twidth = texture->mipmap[mip][2];
1268         theight = texture->mipmap[mip][3];
1269         if (tx1 < 0) tx1 = 0;
1270         if (ty1 < 0) ty1 = 0;
1271         if (tx2 > twidth) tx2 = twidth;
1272         if (ty2 > theight) ty2 = theight;
1273         if (sx1 < 0) sx1 = 0;
1274         if (sy1 < 0) sy1 = 0;
1275         if (sx2 > swidth) sx2 = swidth;
1276         if (sy2 > sheight) sy2 = sheight;
1277         tw = tx2 - tx1;
1278         th = ty2 - ty1;
1279         sw = sx2 - sx1;
1280         sh = sy2 - sy1;
1281         if (tw > sw) tw = sw;
1282         if (th > sh) th = sh;
1283         if (tw < 1 || th < 1)
1284                 return;
1285         sy1 = sheight - 1 - sy1;
1286         for (y = 0;y < th;y++)
1287                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1288         if (texture->mipmaps > 1)
1289                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1290 }
1291
1292 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1293 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1294 {
1295         if (thread->texbound[command->unitnum])
1296                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1297         thread->texbound[command->unitnum] = command->texture;
1298 }
1299 void DPSOFTRAST_SetTexture(int unitnum, int index)
1300 {
1301         DPSOFTRAST_Command_SetTexture *command;
1302         DPSOFTRAST_Texture *texture;
1303         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1304         {
1305                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1306                 return;
1307         }
1308         texture = DPSOFTRAST_Texture_GetByIndex(index);
1309         if (index && !texture)
1310         {
1311                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1312                 return;
1313         }
1314
1315         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1316         command->unitnum = unitnum;
1317         command->texture = texture;
1318
1319         dpsoftrast.texbound[unitnum] = texture;
1320         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1321 }
1322
1323 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1324 {
1325         dpsoftrast.pointer_vertex3f = vertex3f;
1326         dpsoftrast.stride_vertex = stride;
1327 }
1328 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1329 {
1330         dpsoftrast.pointer_color4f = color4f;
1331         dpsoftrast.pointer_color4ub = NULL;
1332         dpsoftrast.stride_color = stride;
1333 }
1334 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1335 {
1336         dpsoftrast.pointer_color4f = NULL;
1337         dpsoftrast.pointer_color4ub = color4ub;
1338         dpsoftrast.stride_color = stride;
1339 }
1340 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1341 {
1342         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1343         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1344         dpsoftrast.stride_texcoord[unitnum] = stride;
1345 }
1346
1347 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1348 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1349 {
1350         thread->shader_mode = command->mode;
1351         thread->shader_permutation = command->permutation;
1352         thread->shader_exactspecularmath = command->exactspecularmath;
1353 }
1354 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1355 {
1356         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1357         command->mode = mode;
1358         command->permutation = permutation;
1359         command->exactspecularmath = exactspecularmath;
1360
1361         dpsoftrast.shader_mode = mode;
1362         dpsoftrast.shader_permutation = permutation;
1363         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1364 }
1365
1366 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1367 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1368 {
1369         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1370 }
1371 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1372 {
1373         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1374         command->index = index;
1375         command->val[0] = v0;
1376         command->val[1] = v1;
1377         command->val[2] = v2;
1378         command->val[3] = v3;
1379
1380         dpsoftrast.uniform4f[index*4+0] = v0;
1381         dpsoftrast.uniform4f[index*4+1] = v1;
1382         dpsoftrast.uniform4f[index*4+2] = v2;
1383         dpsoftrast.uniform4f[index*4+3] = v3;
1384 }
1385 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1386 {
1387         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1388         command->index = index;
1389         memcpy(command->val, v, sizeof(command->val));
1390
1391         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1392 }
1393
1394 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1395 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1396 {
1397         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1398 }
1399 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1400 {
1401 #ifdef SSE_POSSIBLE
1402         int i, index;
1403         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1404         {
1405                 __m128 m0, m1, m2, m3;
1406                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1407                 command->index = (DPSOFTRAST_UNIFORM)index;
1408                 if (((size_t)v)&(ALIGN_SIZE-1))
1409                 {
1410                         m0 = _mm_loadu_ps(v);
1411                         m1 = _mm_loadu_ps(v+4);
1412                         m2 = _mm_loadu_ps(v+8);
1413                         m3 = _mm_loadu_ps(v+12);
1414                 }
1415                 else
1416                 {
1417                         m0 = _mm_load_ps(v);
1418                         m1 = _mm_load_ps(v+4);
1419                         m2 = _mm_load_ps(v+8);
1420                         m3 = _mm_load_ps(v+12);
1421                 }
1422                 if (transpose)
1423                 {
1424                         __m128 t0, t1, t2, t3;
1425                         t0 = _mm_unpacklo_ps(m0, m1);
1426                         t1 = _mm_unpacklo_ps(m2, m3);
1427                         t2 = _mm_unpackhi_ps(m0, m1);
1428                         t3 = _mm_unpackhi_ps(m2, m3);
1429                         m0 = _mm_movelh_ps(t0, t1);
1430                         m1 = _mm_movehl_ps(t1, t0);
1431                         m2 = _mm_movelh_ps(t2, t3);
1432                         m3 = _mm_movehl_ps(t3, t2);                     
1433                 }
1434                 _mm_store_ps(command->val, m0);
1435                 _mm_store_ps(command->val+4, m1);
1436                 _mm_store_ps(command->val+8, m2);
1437                 _mm_store_ps(command->val+12, m3);
1438                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1439                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1440                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1441                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1442         }
1443 #endif
1444 }
1445
1446 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1447 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1448 {
1449         thread->uniform1i[command->index] = command->val;
1450 }
1451 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1452 {
1453         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1454         command->index = index;
1455         command->val = i0;
1456
1457         dpsoftrast.uniform1i[command->index] = i0;
1458 }
1459
1460 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1461 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1462 {
1463         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1464         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1465 }
1466 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1467 {
1468         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1469         command->clipplane[0] = x;
1470         command->clipplane[1] = y;
1471         command->clipplane[2] = z;
1472         command->clipplane[3] = w;
1473 }
1474
1475 #ifdef SSE_POSSIBLE
1476 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1477 {
1478         float *end = dst + size*4;
1479         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1480         {
1481                 while (dst < end)
1482                 {
1483                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1484                         dst += 4;
1485                         src += stride;
1486                 }
1487         }
1488         else
1489         {
1490                 while (dst < end)
1491                 {
1492                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1493                         dst += 4;
1494                         src += stride;
1495                 }
1496         }
1497 }
1498
1499 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1500 {
1501         float *end = dst + size*4;
1502         if (stride == sizeof(float[3]))
1503         {
1504                 float *end4 = dst + (size&~3)*4;        
1505                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1506                 {
1507                         while (dst < end4)
1508                         {
1509                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1510                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1511                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1512                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1513                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1514                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1515                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1516                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1517                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1518                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1519                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1520                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1521                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1522                                 dst += 16;
1523                                 src += 4*sizeof(float[3]);
1524                         }
1525                 }
1526                 else
1527                 {
1528                         while (dst < end4)
1529                         {
1530                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1531                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1532                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1533                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1534                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1535                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1536                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1537                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1538                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1539                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1540                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1541                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1542                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1543                                 dst += 16;
1544                                 src += 4*sizeof(float[3]);
1545                         }
1546                 }
1547         }
1548         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1549         {
1550                 while (dst < end)
1551                 {
1552                         __m128 v = _mm_loadu_ps((const float *)src);
1553                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1554                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1555                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1556                         _mm_store_ps(dst, v);
1557                         dst += 4;
1558                         src += stride;
1559                 }
1560         }
1561         else
1562         {
1563                 while (dst < end)
1564                 {
1565                         __m128 v = _mm_load_ps((const float *)src);
1566                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1567                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1568                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1569                         _mm_store_ps(dst, v);
1570                         dst += 4;
1571                         src += stride;
1572                 }
1573         }
1574 }
1575
1576 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1577 {
1578         float *end = dst + size*4;
1579         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1580         if (stride == sizeof(float[2]))
1581         {
1582                 float *end2 = dst + (size&~1)*4;
1583                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1584                 {
1585                         while (dst < end2)
1586                         {
1587                                 __m128 v = _mm_loadu_ps((const float *)src);
1588                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1589                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1590                                 dst += 8;
1591                                 src += 2*sizeof(float[2]);
1592                         }
1593                 }
1594                 else
1595                 {
1596                         while (dst < end2)
1597                         {
1598                                 __m128 v = _mm_load_ps((const float *)src);
1599                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1600                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1601                                 dst += 8;
1602                                 src += 2*sizeof(float[2]);
1603                         }
1604                 }
1605         }
1606         while (dst < end)
1607         {
1608                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1609                 dst += 4;
1610                 src += stride;
1611         }
1612 }
1613
1614 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1615 {
1616         float *end = dst + size*4;
1617         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1618         if (stride == sizeof(unsigned char[4]))
1619         {
1620                 float *end4 = dst + (size&~3)*4;
1621                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1622                 {
1623                         while (dst < end4)
1624                         {
1625                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1626                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1627                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1628                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1629                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1630                                 dst += 16;
1631                                 src += 4*sizeof(unsigned char[4]);
1632                         }
1633                 }
1634                 else
1635                 {
1636                         while (dst < end4)
1637                         {
1638                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1639                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1640                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1641                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1642                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1643                                 dst += 16;
1644                                 src += 4*sizeof(unsigned char[4]);
1645                         }
1646                 }
1647         }
1648         while (dst < end)
1649         {
1650                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1651                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1652                 dst += 4;
1653                 src += stride;
1654         }
1655 }
1656
// fills 'size' consecutive float4 slots of dst with the four floats at src.
// src is read once with an unaligned load; dst is written with aligned
// 16-byte stores
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;

	for (i = 0; i < size; i++)
		_mm_store_ps(dst + i*4, value);
}
1667 #endif
1668
// multiplies 'numitems' float4 vertices by a 4x4 matrix:
//   out = in.x*row0 + in.y*row1 + in.z*row2 + in.w*row3
// where rowN is inmatrix16f[N*4 .. N*4+3].
// out4f is written with aligned 16-byte stores and may be the same buffer as
// in4f; an identity matrix degenerates to a plain copy.
// the whole body is compiled out when SSE_POSSIBLE is not defined
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 row0, row1, row2, row3;
	int i;

	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}

	row0 = _mm_loadu_ps(inmatrix16f);
	row1 = _mm_loadu_ps(inmatrix16f + 4);
	row2 = _mm_loadu_ps(inmatrix16f + 8);
	row3 = _mm_loadu_ps(inmatrix16f + 12);

	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
		{
			const __m128 v = _mm_loadu_ps(in4f);
			const __m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			const __m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			const __m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			const __m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
	else
	{
		for (i = 0; i < numitems; i++, in4f += 4, out4f += 4)
		{
			const __m128 v = _mm_load_ps(in4f);
			const __m128 tx = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			const __m128 ty = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			const __m128 tz = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			const __m128 tw = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(tx, _mm_add_ps(ty, _mm_add_ps(tz, tw))));
		}
	}
#endif
}
1716
// copies 'numitems' float4 vertices from in4f to out4f.
// nothing is done when numitems is zero or negative (a negative count would
// otherwise turn into a huge size_t), or when both pointers name the same
// buffer (memcpy is undefined for overlapping ranges)
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0 || out4f == in4f)
		return;
	memcpy(out4f, in4f, numitems * sizeof(float[4]));
}
1721
1722 #ifdef SSE_POSSIBLE
// perspective-divides the vertex 'in' = (x, y, z, w) and applies the viewport:
// with c = viewportcenter and s = viewportscale, 'out' receives
//   (c[1] + s[1]*x/w, c[2] + s[2]*y/w, c[3] + s[3]*z/w, c[0] + s[0]/w)
// 'in' is copied before 'out' is assigned, so both may name the same variable
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	const __m128 pv_clip = (in); \
	const __m128 pv_wwww = _mm_shuffle_ps(pv_clip, pv_clip, _MM_SHUFFLE(3, 3, 3, 3)); \
	const __m128 pv_1xyz = _mm_move_ss(_mm_shuffle_ps(pv_clip, pv_clip, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
	const __m128 pv_scaled = _mm_div_ps(_mm_mul_ps((viewportscale), pv_1xyz), pv_wwww); \
	const __m128 pv_screen = _mm_add_ps((viewportcenter), pv_scaled); \
	(out) = _mm_shuffle_ps(pv_screen, pv_screen, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1730
// performs the same computation as DPSOFTRAST_PROJECTVERTEX; despite the name
// every component is projected, not just Y:
// with c = viewportcenter and s = viewportscale, 'out' receives
//   (c[1] + s[1]*x/w, c[2] + s[2]*y/w, c[3] + s[3]*z/w, c[0] + s[0]/w)
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	const __m128 py_clip = (in); \
	const __m128 py_wwww = _mm_shuffle_ps(py_clip, py_clip, _MM_SHUFFLE(3, 3, 3, 3)); \
	const __m128 py_1xyz = _mm_move_ss(_mm_shuffle_ps(py_clip, py_clip, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
	const __m128 py_scaled = _mm_div_ps(_mm_mul_ps((viewportscale), py_1xyz), py_wwww); \
	const __m128 py_screen = _mm_add_ps((viewportcenter), py_scaled); \
	(out) = _mm_shuffle_ps(py_screen, py_screen, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1738
// multiplies the vertex 'in' = (x, y, z, w) by the matrix given as four
// vectors: out = x*m0 + (y*m1 + (z*m2 + w*m3))
// 'in' is copied before 'out' is assigned, so both may name the same variable
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	const __m128 tv_in = (in); \
	const __m128 tv_x = _mm_mul_ps(_mm_shuffle_ps(tv_in, tv_in, _MM_SHUFFLE(0, 0, 0, 0)), (m0)); \
	const __m128 tv_y = _mm_mul_ps(_mm_shuffle_ps(tv_in, tv_in, _MM_SHUFFLE(1, 1, 1, 1)), (m1)); \
	const __m128 tv_z = _mm_mul_ps(_mm_shuffle_ps(tv_in, tv_in, _MM_SHUFFLE(2, 2, 2, 2)), (m2)); \
	const __m128 tv_w = _mm_mul_ps(_mm_shuffle_ps(tv_in, tv_in, _MM_SHUFFLE(3, 3, 3, 3)), (m3)); \
	(out) = _mm_add_ps(tv_x, _mm_add_ps(tv_y, _mm_add_ps(tv_z, tv_w))); \
}
1747
1748 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1749 {
1750         int clipmask = 0xFF;
1751         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1752         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1753         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1754         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1755         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1756         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1757         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1758         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1759         #define BBFRONT(k, pos) \
1760         { \
1761                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1762                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1763                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1764                 { \
1765                         __m128 proj; \
1766                         clipmask &= ~(1<<k); \
1767                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1768                         minproj = _mm_min_ss(minproj, proj); \
1769                         maxproj = _mm_max_ss(maxproj, proj); \
1770                 } \
1771         }
1772         BBFRONT(0, minpos); 
1773         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1774         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1775         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1776         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1777         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1778         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1779         BBFRONT(7, maxpos);
1780         #define BBCLIP(k) \
1781         { \
1782                 if (clipmask&(1<<k)) \
1783                 { \
1784                         if (!(clipmask&(1<<(k^1)))) \
1785                         { \
1786                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1787                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1788                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1789                                 minproj = _mm_min_ss(minproj, proj); \
1790                                 maxproj = _mm_max_ss(maxproj, proj); \
1791                         } \
1792                         if (!(clipmask&(1<<(k^2)))) \
1793                         { \
1794                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1795                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1796                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1797                                 minproj = _mm_min_ss(minproj, proj); \
1798                                 maxproj = _mm_max_ss(maxproj, proj); \
1799                         } \
1800                         if (!(clipmask&(1<<(k^4)))) \
1801                         { \
1802                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1803                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1804                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1805                                 minproj = _mm_min_ss(minproj, proj); \
1806                                 maxproj = _mm_max_ss(maxproj, proj); \
1807                         } \
1808                 } \
1809         }
1810         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1811         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1812         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1813         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1814         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1815         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1816         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1817         *starty = _mm_cvttss_si32(maxproj);
1818         *endy = _mm_cvttss_si32(minproj)+1;
1819         return clipmask;
1820 }
1821         
1822 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1823 {
1824         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1825         float *end = out4f + numitems*4;
1826         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1827         __m128 minpos, maxpos;
1828         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1829         {
1830                 minpos = maxpos = _mm_loadu_ps(in4f);
1831                 while (out4f < end)
1832                 {
1833                         __m128 v = _mm_loadu_ps(in4f);
1834                         minpos = _mm_min_ps(minpos, v);
1835                         maxpos = _mm_max_ps(maxpos, v);
1836                         _mm_store_ps(out4f, v);
1837                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1838                         _mm_store_ps(screen4f, v);
1839                         in4f += 4;
1840                         out4f += 4;
1841                         screen4f += 4;
1842                 }
1843         }
1844         else
1845         {
1846                 minpos = maxpos = _mm_load_ps(in4f);
1847                 while (out4f < end)
1848                 {
1849                         __m128 v = _mm_load_ps(in4f);
1850                         minpos = _mm_min_ps(minpos, v);
1851                         maxpos = _mm_max_ps(maxpos, v);
1852                         _mm_store_ps(out4f, v);
1853                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1854                         _mm_store_ps(screen4f, v);
1855                         in4f += 4;
1856                         out4f += 4;
1857                         screen4f += 4;
1858                 }
1859         }
1860         if (starty && endy) 
1861         {
1862                 ALIGN(float minposf[4]);
1863                 ALIGN(float maxposf[4]);
1864                 _mm_store_ps(minposf, minpos);
1865                 _mm_store_ps(maxposf, maxpos);
1866                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1867         }
1868         return 0;
1869 }
1870
1871 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1872 {
1873         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1874         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1875         float *end;
1876         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1877                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1878         end = out4f + numitems*4;
1879         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1880         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1881         m0 = _mm_loadu_ps(inmatrix16f);
1882         m1 = _mm_loadu_ps(inmatrix16f + 4);
1883         m2 = _mm_loadu_ps(inmatrix16f + 8);
1884         m3 = _mm_loadu_ps(inmatrix16f + 12);
1885         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1886         {
1887                 minpos = maxpos = _mm_loadu_ps(in4f);
1888                 while (out4f < end)
1889                 {
1890                         __m128 v = _mm_loadu_ps(in4f);
1891                         minpos = _mm_min_ps(minpos, v);
1892                         maxpos = _mm_max_ps(maxpos, v);
1893                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1894                         _mm_store_ps(out4f, v);
1895                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1896                         _mm_store_ps(screen4f, v);
1897                         in4f += 4;
1898                         out4f += 4;
1899                         screen4f += 4;
1900                 }
1901         }
1902         else
1903         {
1904                 minpos = maxpos = _mm_load_ps(in4f);
1905                 while (out4f < end)
1906                 {
1907                         __m128 v = _mm_load_ps(in4f);
1908                         minpos = _mm_min_ps(minpos, v);
1909                         maxpos = _mm_max_ps(maxpos, v);
1910                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1911                         _mm_store_ps(out4f, v);
1912                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1913                         _mm_store_ps(screen4f, v);
1914                         in4f += 4;
1915                         out4f += 4;
1916                         screen4f += 4;
1917                 }
1918         }
1919         if (starty && endy) 
1920         {
1921                 ALIGN(float minposf[4]);
1922                 ALIGN(float maxposf[4]);
1923                 _mm_store_ps(minposf, minpos);
1924                 _mm_store_ps(maxposf, maxpos);
1925                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1926         }
1927         return 0;
1928 }
1929 #endif
1930
// loads the vertex attribute 'inarray' for the current vertex range
// (dpsoftrast.firstvertex / dpsoftrast.numvertices) into the float4 array
// dpsoftrast.post_array4f[outarray] and returns that array.
// position is expanded from 3 floats; color comes from the float pointer,
// else the byte pointer, else the constant dpsoftrast.color; any other value
// is treated as a texcoord unit with 2, 3 or 4 components.
// the output is left untouched when a texcoord unit has no pointer or an
// unhandled component count.
// returns NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	const unsigned char *inb;
	int stride;

	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
		{
			// no color array bound: replicate the constant color
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
		}
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
				break;
			default:
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1989
1990 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1991 {
1992         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1993         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1994         return data;
1995 }
1996
#if 0
// (compiled out) projects a float4 vertex array to screen coordinates without
// a matrix and returns it; screen positions go to dpsoftrast.screencoord4f and
// the draw Y range / clip mask are stored in dpsoftrast.
// with inarray >= 0 the array is first loaded through DPSOFTRAST_Array_Load;
// otherwise the current contents of dpsoftrast.post_array4f[outarray] are used
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;

	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];

	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
2009
// transforms a float4 vertex array in place by inmatrix16f, projects it to
// screen coordinates and returns it; screen positions go to
// dpsoftrast.screencoord4f and the draw Y range / clip mask are stored in
// dpsoftrast.
// with inarray >= 0 the array is first loaded through DPSOFTRAST_Array_Load;
// otherwise the current contents of dpsoftrast.post_array4f[outarray] are used.
// returns NULL when SSE_POSSIBLE is not defined
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;

	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];

	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2020
2021 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2022 {
2023         int x;
2024         int startx = span->startx;
2025         int endx = span->endx;
2026         float wslope = triangle->w[0];
2027         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2028         float endz = 1.0f / (w + wslope * startx);
2029         if (triangle->w[0] == 0)
2030         {
2031                 // LordHavoc: fast flat polygons (HUD/menu)
2032                 for (x = startx;x < endx;x++)
2033                         zf[x] = endz;
2034                 return;
2035         }
2036         for (x = startx;x < endx;)
2037         {
2038                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2039                 float z = endz, dz;
2040                 if (nextsub >= endx) nextsub = endsub = endx-1;
2041                 endz = 1.0f / (w + wslope * nextsub);
2042                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2043                 for (; x <= endsub; x++, z += dz)
2044                         zf[x] = z;
2045         }
2046 }
2047
2048 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
             // Writes the span's computed colors (in4ub, four bytes per pixel, indexed
             // by x within the span) into color buffer 0 using thread->fb_blendmode.
             // Only pixels whose span->pixelmask entry is nonzero are written.
             // Side effects on span->pixelmask: entries are cleared for pixels failing
             // the alpha test (alpha byte < 128 when thread->alphatest is set) and for
             // parts of zero-alpha runs in the alpha blend modes; two sentinel bytes
             // are written at pixelmask[endx] and pixelmask[endx+1].
             // Returns without doing anything if color buffer 0 is NULL.
             // triangle is not referenced; the whole body is compiled only when
             // SSE_POSSIBLE is defined.
2049 {
2050 #ifdef SSE_POSSIBLE
2051         int x;
2052         int startx = span->startx;
2053         int endx = span->endx;
2054         int maskx;
2055         int subx;
2056         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2057         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // pixel is only used for the NULL check below; all framebuffer
             // accesses go through pixeli (one unsigned int per pixel)
2058         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2059         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2060         if (!pixel)
2061                 return;
             // move both pointers to the span origin so they can be indexed by x
2062         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2063         pixeli += span->y * dpsoftrast.fb_width + span->x;
2064         // handle alphatest now (this affects depth writes too)
2065         if (thread->alphatest)
2066                 for (x = startx;x < endx;x++)
2067                         if (in4ub[x*4+3] < 128)
2068                                 pixelmask[x] = false;
2069         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2070         // helps sprites, text and hud artwork
2071         switch(thread->fb_blendmode)
2072         {
2073         case DPSOFTRAST_BLENDMODE_ALPHA:
2074         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2075         case DPSOFTRAST_BLENDMODE_SUBALPHA:
                     // trim the span to the pixels that can contribute: startx moves
                     // to the first pixel with nonzero alpha, endx is pulled back to
                     // maskx (the end of the last nonzero-alpha run).  If no pixel has
                     // nonzero alpha, maskx stays at startx and nothing is drawn.
2076                 maskx = startx;
2077                 for (x = startx;x < endx;x++)
2078                 {
2079                         if (in4ub[x*4+3] >= 1)
2080                         {
2081                                 startx = x;
2082                                 for (;;)
2083                                 {
                                             // skip to the end of this nonzero-alpha run
2084                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2085                                         maskx = x;
2086                                         if (x >= endx) break;
                                             // x is on a zero-alpha pixel; the scan resumes at
                                             // x+2, so the first two pixels of the run keep
                                             // their mask (x+1 is not examined at all)
2087                                         ++x;
2088                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2089                                         if (x >= endx) break;
2090                                 }
2091                                 break;
2092                         }
2093                 }
2094                 endx = maskx;
2095                 break;
2096         case DPSOFTRAST_BLENDMODE_OPAQUE:
2097         case DPSOFTRAST_BLENDMODE_ADD:
2098         case DPSOFTRAST_BLENDMODE_INVMOD:
2099         case DPSOFTRAST_BLENDMODE_MUL:
2100         case DPSOFTRAST_BLENDMODE_MUL2:
2101         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2102         case DPSOFTRAST_BLENDMODE_INVADD:
2103                 break;
2104         }
2105         // put some special values at the end of the mask to ensure the loops end
             // (a 1 stops the "masked" scans, the following 0 stops the "unmasked"
             // scans; this writes two entries past endx, so the mask buffer must
             // have room for them - presumably guaranteed where it is allocated,
             // TODO confirm)
2106         pixelmask[endx] = 1;
2107         pixelmask[endx+1] = 0;
2108         // LordHavoc: use a double loop to identify subspans, this helps the
2109         // optimized copy/blend loops to perform at their best, most triangles
2110         // have only one run of pixels, and do the search using wide reads...
2111         x = startx;
2112         while (x < endx)
2113         {
2114                 // if this pixel is masked off, it's probably not alone...
2115                 if (!pixelmask[x])
2116                 {
2117                         x++;
2118 #if 1
                             // fast path, only used when at least 8 more entries remain
2119                         if (x + 8 < endx)
2120                         {
2121                                 // the 4-item search must be aligned or else it stalls badly
                                     // NOTE(review): the first goto below can never be taken,
                                     // the enclosing condition already requires !pixelmask[x]
2122                                 if ((x & 3) && !pixelmask[x]) 
2123                                 {
2124                                         if(pixelmask[x]) goto endmasked;
2125                                         x++;
2126                                         if (x & 3)
2127                                         {
2128                                                 if(pixelmask[x]) goto endmasked;
2129                                                 x++;
2130                                                 if (x & 3)
2131                                                 {
2132                                                         if(pixelmask[x]) goto endmasked;
2133                                                         x++;
2134                                                 }
2135                                         }
2136                                 }
                                     // test four mask bytes per read; all-zero means all
                                     // four pixels are masked off
2137                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2138                                         x += 4;
2139                         }
2140 #endif
                             // byte-wise scan for the next unmasked pixel (stops at the
                             // sentinel 1 at pixelmask[endx] at the latest)
2141                         for (;!pixelmask[x];x++)
2142                                 ;
2143                         // rather than continue the loop, just check the end variable
2144                         if (x >= endx)
2145                                 break;
2146                 }
2147         endmasked:
                     // x is now the first pixel of a run of unmasked pixels;
                     // subx will become one past the last pixel of that run
2148                 // find length of subspan
2149                 subx = x + 1;
2150 #if 1
2151                 if (subx + 8 < endx)
2152                 {
                             // step byte-wise until subx is a multiple of 4
2153                         if (subx & 3)
2154                         {
2155                                 if(!pixelmask[subx]) goto endunmasked;
2156                                 subx++;
2157                                 if (subx & 3)
2158                                 {
2159                                         if(!pixelmask[subx]) goto endunmasked;
2160                                         subx++;
2161                                         if (subx & 3)
2162                                         {
2163                                                 if(!pixelmask[subx]) goto endunmasked;
2164                                                 subx++;
2165                                         }
2166                                 }
2167                         }
                             // test four mask bytes per read; this only matches when
                             // each of the four bytes is exactly 1
2168                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2169                                 subx += 4;
2170                 }
2171 #endif
                     // byte-wise scan for the end of the run (stops at the sentinel 0
                     // at pixelmask[endx+1] at the latest)
2172                 for (;pixelmask[subx];subx++)
2173                         ;
2174                 // the checks can overshoot, so make sure to clip it...
2175                 if (subx > endx)
2176                         subx = endx;
2177         endunmasked:
2178                 // now that we know the subspan length...  process!
                     // every case below advances x up to subx
2179                 switch(thread->fb_blendmode)
2180                 {
2181                 case DPSOFTRAST_BLENDMODE_OPAQUE:
                             // plain copy: 16 pixels per iteration, then 4, then 2, then 1
2182 #if 0
2183                         if (subx - x >= 16)
2184                         {
2185                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2186                                 x = subx;
2187                         }
2188                         else
2189 #elif 1
2190                         while (x + 16 <= subx)
2191                         {
2192                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2193                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2194                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2195                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2196                                 x += 16;
2197                         }
2198 #endif
2199                         {
2200                                 while (x + 4 <= subx)
2201                                 {
2202                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2203                                         x += 4;
2204                                 }
2205                                 if (x + 2 <= subx)
2206                                 {
2207                                         pixeli[x] = ini[x];
2208                                         pixeli[x+1] = ini[x+1];
2209                                         x += 2;
2210                                 }
2211                                 if (x < subx)
2212                                 {
2213                                         pixeli[x] = ini[x];
2214                                         x++;
2215                                 }
2216                         }
2217                         break;
2218                 case DPSOFTRAST_BLENDMODE_ALPHA:
                     // FINISHBLEND(blend2, blend1): shared read-modify-write loop for all
                     // non-opaque modes.  src (span color) and dst (framebuffer color) are
                     // widened from 8 to 16 bits per channel, the blend statement updates
                     // dst, and the result is packed back to bytes with unsigned
                     // saturation.  blend2 runs on two pixels at a time, blend1 on the
                     // single leftover pixel when the run length is odd.
                     // (no comments inside the macro: the lines are joined by backslashes)
2219                 #define FINISHBLEND(blend2, blend1) \
2220                         for (;x + 1 < subx;x += 2) \
2221                         { \
2222                                 __m128i src, dst; \
2223                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2224                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2225                                 blend2; \
2226                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2227                         } \
2228                         if (x < subx) \
2229                         { \
2230                                 __m128i src, dst; \
2231                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2232                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2233                                 blend1; \
2234                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2235                                 x++; \
2236                         }
                             // dst = dst + (src - dst) * alpha / 256, where "blend" is the
                             // source alpha (channel 3) broadcast to all channels of its pixel
2237                         FINISHBLEND({
2238                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2239                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2240                         }, {
2241                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2242                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2243                         });
2244                         break;
2245                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
                             // dst = dst + src * alpha / 256
2246                         FINISHBLEND({
2247                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2248                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2249                         }, {
2250                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2251                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2252                         });
2253                         break;
2254                 case DPSOFTRAST_BLENDMODE_ADD:
                             // dst = dst + src (saturated when packed back to bytes)
2255                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2256                         break;
2257                 case DPSOFTRAST_BLENDMODE_INVMOD:
                             // dst = dst - dst * src / 256
2258                         FINISHBLEND({
2259                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2260                         }, {
2261                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2262                         });
2263                         break;
2264                 case DPSOFTRAST_BLENDMODE_MUL:
                             // dst = src * dst / 256
2265                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2266                         break;
2267                 case DPSOFTRAST_BLENDMODE_MUL2:
                             // dst = src * dst / 128 (saturated when packed back to bytes)
2268                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2269                         break;
2270                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
                             // dst = dst - src * alpha / 256 (saturated at 0 when packed)
2271                         FINISHBLEND({
2272                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2273                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2274                         }, {
2275                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2276                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2277                         });
2278                         break;
2279                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
                             // dst = src + dst - dst * alpha / 256
2280                         FINISHBLEND({
2281                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2282                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2283                         }, {
2284                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2285                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2286                         });
2287                         break;
2288                 case DPSOFTRAST_BLENDMODE_INVADD:
                             // dst = dst + (255 - dst) * src / 256
2289                         FINISHBLEND({
2290                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2291                         }, {
2292                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2293                         });
2294                         break;
2295                 }
2296         }
2297 #endif
2298 }
2299
2300 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2301         // warning: this is SLOW, only use if the optimized per-span functions won't do
2302 {
2303         const unsigned char * RESTRICT pixelbase;
2304         const unsigned char * RESTRICT pixel[4];
2305         int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2306         int wrapmask[2] = { width-1, height-1 };
2307         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2308         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2309         {
2310                 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2311                 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2312                 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2313                 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2314                 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2315                 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2316                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2317                 {
2318                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2319                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2320                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2321                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2322                 }
2323                 else
2324                 {
2325                         tci[0] &= wrapmask[0];
2326                         tci[1] &= wrapmask[1];
2327                         tci1[0] &= wrapmask[0];
2328                         tci1[1] &= wrapmask[1];
2329                 }
2330                 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2331                 pixel[1] = pixelbase + 4 * (tci[1]*width+tci1[0]);
2332                 pixel[2] = pixelbase + 4 * (tci1[1]*width+tci[0]);
2333                 pixel[3] = pixelbase + 4 * (tci1[1]*width+tci1[0]);
2334                 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2335                 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2336                 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2337                 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2338         }
2339         else
2340         {
2341                 int tci[2] = { x * width, y * height };
2342                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2343                 {
2344                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2345                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2346                 }
2347                 else
2348                 {
2349                         tci[0] &= wrapmask[0];
2350                         tci[1] &= wrapmask[1];
2351                 }
2352                 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2353                 c[0] = pixel[0][0];
2354                 c[1] = pixel[0][1];
2355                 c[2] = pixel[0][2];
2356                 c[3] = pixel[0][3];
2357         }
2358 }
2359
2360 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2361 {
2362         int x;
2363         int startx = span->startx;
2364         int endx = span->endx;
2365         int flags;
2366         float c[4];
2367         float data[4];
2368         float slope[4];
2369         float tc[2], endtc[2];
2370         float tcscale[2];
2371         unsigned int tci[2];
2372         unsigned int tci1[2];
2373         unsigned int tcimin[2];
2374         unsigned int tcimax[2];
2375         int tciwrapmask[2];
2376         int tciwidth;
2377         int filter;
2378         int mip;
2379         const unsigned char * RESTRICT pixelbase;
2380         const unsigned char * RESTRICT pixel[4];
2381         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2382         // if no texture is bound, just fill it with white
2383         if (!texture)
2384         {
2385                 for (x = startx;x < endx;x++)
2386                 {
2387                         out4f[x*4+0] = 1.0f;
2388                         out4f[x*4+1] = 1.0f;
2389                         out4f[x*4+2] = 1.0f;
2390                         out4f[x*4+3] = 1.0f;
2391                 }
2392                 return;
2393         }
2394         mip = triangle->mip[texunitindex];
2395         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2396         // if this mipmap of the texture is 1 pixel, just fill it with that color
2397         if (texture->mipmap[mip][1] == 4)
2398         {
2399                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2400                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2401                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2402                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2403                 for (x = startx;x < endx;x++)
2404                 {
2405                         out4f[x*4+0] = c[0];
2406                         out4f[x*4+1] = c[1];
2407                         out4f[x*4+2] = c[2];
2408                         out4f[x*4+3] = c[3];
2409                 }
2410                 return;
2411         }
2412         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2413         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2414         flags = texture->flags;
2415         tcscale[0] = texture->mipmap[mip][2];
2416         tcscale[1] = texture->mipmap[mip][3];
2417         tciwidth = texture->mipmap[mip][2];
2418         tcimin[0] = 0;
2419         tcimin[1] = 0;
2420         tcimax[0] = texture->mipmap[mip][2]-1;
2421         tcimax[1] = texture->mipmap[mip][3]-1;
2422         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2423         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2424         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2425         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2426         if (filter)
2427         {
2428                 endtc[0] -= 0.5f;
2429                 endtc[1] -= 0.5f;
2430         }
2431         for (x = startx;x < endx;)
2432         {
2433                 unsigned int subtc[2];
2434                 unsigned int substep[2];
2435                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2436                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2437                 if (nextsub >= endx)
2438                 {
2439                         nextsub = endsub = endx-1;      
2440                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2441                 }
2442                 tc[0] = endtc[0];
2443                 tc[1] = endtc[1];
2444                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2445                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2446                 if (filter)
2447                 {
2448                         endtc[0] -= 0.5f;
2449                         endtc[1] -= 0.5f;
2450                 }
2451                 substep[0] = (endtc[0] - tc[0]) * subscale;
2452                 substep[1] = (endtc[1] - tc[1]) * subscale;
2453                 subtc[0] = tc[0] * (1<<12);
2454                 subtc[1] = tc[1] * (1<<12);
2455                 if (filter)
2456                 {
2457                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2458                         {
2459                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2460                                 {
2461                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2462                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2463                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2464                                         tci[0] = subtc[0]>>12;
2465                                         tci[1] = subtc[1]>>12;
2466                                         tci1[0] = tci[0] + 1;
2467                                         tci1[1] = tci[1] + 1;
2468                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2469                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2470                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2471                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2472                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2473                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2474                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2475                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2476                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2477                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2478                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2479                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2480                                         out4f[x*4+0] = c[0];
2481                                         out4f[x*4+1] = c[1];
2482                                         out4f[x*4+2] = c[2];
2483                                         out4f[x*4+3] = c[3];
2484                                 }
2485                         }
2486                         else
2487                         {
2488                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2489                                 {
2490                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2491                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2492                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2493                                         tci[0] = subtc[0]>>12;
2494                                         tci[1] = subtc[1]>>12;
2495                                         tci1[0] = tci[0] + 1;
2496                                         tci1[1] = tci[1] + 1;
2497                                         tci[0] &= tciwrapmask[0];
2498                                         tci[1] &= tciwrapmask[1];
2499                                         tci1[0] &= tciwrapmask[0];
2500                                         tci1[1] &= tciwrapmask[1];
2501                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2502                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2503                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2504                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2505                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2506                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2507                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2508                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2509                                         out4f[x*4+0] = c[0];
2510                                         out4f[x*4+1] = c[1];
2511                                         out4f[x*4+2] = c[2];
2512                                         out4f[x*4+3] = c[3];
2513                                 }
2514                         }
2515                 }
2516                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2517                 {
2518                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2519                         {
2520                                 tci[0] = subtc[0]>>12;
2521                                 tci[1] = subtc[1]>>12;
2522                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2523                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2524                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2525                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2526                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2527                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2528                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2529                                 out4f[x*4+0] = c[0];
2530                                 out4f[x*4+1] = c[1];
2531                                 out4f[x*4+2] = c[2];
2532                                 out4f[x*4+3] = c[3];
2533                         }
2534                 }
2535                 else
2536                 {
2537                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2538                         {
2539                                 tci[0] = subtc[0]>>12;
2540                                 tci[1] = subtc[1]>>12;
2541                                 tci[0] &= tciwrapmask[0];
2542                                 tci[1] &= tciwrapmask[1];
2543                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2544                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2545                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2546                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2547                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2548                                 out4f[x*4+0] = c[0];
2549                                 out4f[x*4+1] = c[1];
2550                                 out4f[x*4+2] = c[2];
2551                                 out4f[x*4+3] = c[3];
2552                         }
2553                 }
2554         }
2555 }
2556
2557 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2558 {
2559 #ifdef SSE_POSSIBLE
2560         int x;
2561         int startx = span->startx;
2562         int endx = span->endx;
2563         int flags;
2564         __m128 data, slope, tcscale;
2565         __m128i tcsize, tcmask, tcoffset, tcmax;
2566         __m128 tc, endtc;
2567         __m128i subtc, substep, endsubtc;
2568         int filter;
2569         int mip;
2570         int affine; // LordHavoc: optimized affine texturing case
2571         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2572         const unsigned char * RESTRICT pixelbase;
2573         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2574         // if no texture is bound, just fill it with white
2575         if (!texture)
2576         {
2577                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2578                 return;
2579         }
2580         mip = triangle->mip[texunitindex];
2581         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2582         // if this mipmap of the texture is 1 pixel, just fill it with that color
2583         if (texture->mipmap[mip][1] == 4)
2584         {
2585                 unsigned int k = *((const unsigned int *)pixelbase);
2586                 for (x = startx;x < endx;x++)
2587                         outi[x] = k;
2588                 return;
2589         }
2590         affine = zf[startx] == zf[endx-1];
2591         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2592         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2593         flags = texture->flags;
2594         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2595         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2596         tcscale = _mm_cvtepi32_ps(tcsize);
2597         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2598         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2599         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2600         if (filter)
2601                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2602         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2603         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2604         tcmax = _mm_packs_epi32(tcmask, tcmask);
2605         for (x = startx;x < endx;)
2606         {
2607                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2608                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2609                 if (nextsub >= endx || affine)
2610                 {
2611                         nextsub = endsub = endx-1;
2612                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2613                 }       
2614                 tc = endtc;
2615                 subtc = endsubtc;
2616                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2617                 if (filter)
2618                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2619                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2620                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2621                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2622                 substep = _mm_slli_epi32(substep, 1);
2623                 if (filter)
2624                 {
2625                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2626                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2627                         {
2628                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2629                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2630                                 {
2631                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2632                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2633                                         tci = _mm_madd_epi16(tci, tcoffset);
2634                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2635                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2636                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2637                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2638                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2639                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2640                                         fracm = _mm_srli_epi16(subtc, 1);
2641                                         pix1 = _mm_add_epi16(pix1,
2642                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2643                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2644                                         pix3 = _mm_add_epi16(pix3,
2645                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2646                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2647                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2648                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2649                                         pix2 = _mm_add_epi16(pix2,
2650                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2651                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2652                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2653                                 }
2654                                 if (x <= endsub)
2655                                 {
2656                                         const unsigned char * RESTRICT ptr1;
2657                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2658                                         tci = _mm_madd_epi16(tci, tcoffset);
2659                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2660                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2661                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2662                                         fracm = _mm_srli_epi16(subtc, 1);
2663                                         pix1 = _mm_add_epi16(pix1,
2664                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2665                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2666                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2667                                         pix1 = _mm_add_epi16(pix1,
2668                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2669                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2670                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2671                                         x++;
2672                                 }
2673                         }
2674                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2675                         {
2676                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2677                                 {
2678                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2679                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2680                                         tci = _mm_madd_epi16(tci, tcoffset);
2681                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2682                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2683                                                                                         _mm_setzero_si128());
2684                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2685                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2686                                                                                         _mm_setzero_si128());
2687                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2688                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2689                                         tci = _mm_madd_epi16(tci, tcoffset);
2690                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2691                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2692                                                                                         _mm_setzero_si128());
2693                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2694                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2695                                                                                         _mm_setzero_si128());
2696                                         fracm = _mm_srli_epi16(subtc, 1);
2697                                         pix1 = _mm_add_epi16(pix1,
2698                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2699                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2700                                         pix3 = _mm_add_epi16(pix3,
2701                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2702                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2703                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2704                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2705                                         pix2 = _mm_add_epi16(pix2,
2706                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2707                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2708                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2709                                 }
2710                                 if (x <= endsub)
2711                                 {
2712                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2713                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2714                                         tci = _mm_madd_epi16(tci, tcoffset);
2715                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2716                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2717                                                                                         _mm_setzero_si128());
2718                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2719                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2720                                                                                         _mm_setzero_si128());
2721                                         fracm = _mm_srli_epi16(subtc, 1);
2722                                         pix1 = _mm_add_epi16(pix1,
2723                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2724                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2725                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2726                                         pix1 = _mm_add_epi16(pix1,
2727                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2728                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2729                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2730                                         x++;
2731                                 }
2732                         }
2733                         else
2734                         {
2735                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2736                                 {
2737                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2738                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2739                                         tci = _mm_madd_epi16(tci, tcoffset);
2740                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2741                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2742                                                                                         _mm_setzero_si128());
2743                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2744                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2745                                                                                         _mm_setzero_si128());
2746                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2747                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2748                                         tci = _mm_madd_epi16(tci, tcoffset);
2749                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2750                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2751                                                                                         _mm_setzero_si128());
2752                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2753                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2754                                                                                         _mm_setzero_si128());
2755                                         fracm = _mm_srli_epi16(subtc, 1);
2756                                         pix1 = _mm_add_epi16(pix1,
2757                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2758                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2759                                         pix3 = _mm_add_epi16(pix3,
2760                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2761                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2762                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2763                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2764                                         pix2 = _mm_add_epi16(pix2,
2765                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2766                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2767                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2768                                 }
2769                                 if (x <= endsub)
2770                                 {
2771                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2772                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2773                                         tci = _mm_madd_epi16(tci, tcoffset);
2774                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2775                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2776                                                                                         _mm_setzero_si128());
2777                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2778                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2779                                                                                         _mm_setzero_si128());
2780                                         fracm = _mm_srli_epi16(subtc, 1);
2781                                         pix1 = _mm_add_epi16(pix1,
2782                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2783                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2784                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2785                                         pix1 = _mm_add_epi16(pix1,
2786                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2787                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2788                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2789                                         x++;
2790                                 }
2791                         }
2792                 }
2793                 else
2794                 {
2795                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2796                         {
2797                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2798                                 {
2799                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2800                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2801                                         tci = _mm_madd_epi16(tci, tcoffset);
2802                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2803                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2804                                 }
2805                                 if (x <= endsub)
2806                                 {
2807                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2808                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2809                                         tci = _mm_madd_epi16(tci, tcoffset);
2810                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2811                                         x++;
2812                                 }
2813                         }
2814                         else
2815                         {
2816                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2817                                 {
2818                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2819                                         tci = _mm_and_si128(tci, tcmax); 
2820                                         tci = _mm_madd_epi16(tci, tcoffset);
2821                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2822                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2823                                 }
2824                                 if (x <= endsub)
2825                                 {
2826                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2827                                         tci = _mm_and_si128(tci, tcmax); 
2828                                         tci = _mm_madd_epi16(tci, tcoffset);
2829                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2830                                         x++;
2831                                 }
2832                         }
2833                 }
2834         }
2835 #endif
2836 }
2837
2838 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2839 {
2840         // TODO: IMPLEMENT
2841         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2842 }
2843
// Shadowmap lookup placeholder: sampling is not implemented, so 'vector' is
// never read and every query returns 1.0f (presumably "not shadowed" - confirm
// against the callers once a real implementation exists).
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	return result;
}
2849
2850 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2851 {
2852         int x;
2853         int startx = span->startx;
2854         int endx = span->endx;
2855         float c[4];
2856         float data[4];
2857         float slope[4];
2858         float z;
2859         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2860         for (x = startx;x < endx;x++)
2861         {
2862                 z = zf[x];
2863                 c[0] = (data[0] + slope[0]*x) * z;
2864                 c[1] = (data[1] + slope[1]*x) * z;
2865                 c[2] = (data[2] + slope[2]*x) * z;
2866                 c[3] = (data[3] + slope[3]*x) * z;
2867                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2868                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2869                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2870                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2871         }
2872 }
2873
2874 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2875 {
2876         int x;
2877         int startx = span->startx;
2878         int endx = span->endx;
2879         float c[4];
2880         float data[4];
2881         float slope[4];
2882         float z;
2883         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2884         for (x = startx;x < endx;x++)
2885         {
2886                 z = zf[x];
2887                 c[0] = (data[0] + slope[0]*x) * z;
2888                 c[1] = (data[1] + slope[1]*x) * z;
2889                 c[2] = (data[2] + slope[2]*x) * z;
2890                 c[3] = (data[3] + slope[3]*x) * z;
2891                 out4f[x*4+0] = c[0];
2892                 out4f[x*4+1] = c[1];
2893                 out4f[x*4+2] = c[2];
2894                 out4f[x*4+3] = c[3];
2895         }
2896 }
2897
2898 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2899 {
2900         int x, startx = span->startx, endx = span->endx;
2901         float c[4], localcolor[4];
2902         localcolor[0] = subcolor[0];
2903         localcolor[1] = subcolor[1];
2904         localcolor[2] = subcolor[2];
2905         localcolor[3] = subcolor[3];
2906         for (x = startx;x < endx;x++)
2907         {
2908                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2909                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2910                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2911                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2912                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2913                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2914                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2915                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2916         }
2917 }
2918
2919 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2920 {
2921         int x, startx = span->startx, endx = span->endx;
2922         for (x = startx;x < endx;x++)
2923         {
2924                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2925                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2926                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2927                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2928         }
2929 }
2930
2931 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2932 {
2933         int x, startx = span->startx, endx = span->endx;
2934         for (x = startx;x < endx;x++)
2935         {
2936                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2937                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2938                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2939                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2940         }
2941 }
2942
2943 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2944 {
2945         int x, startx = span->startx, endx = span->endx;
2946         float a, b;
2947         for (x = startx;x < endx;x++)
2948         {
2949                 a = 1.0f - inb4f[x*4+3];
2950                 b = inb4f[x*4+3];
2951                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2952                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2953                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2954                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2955         }
2956 }
2957
2958 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2959 {
2960         int x, startx = span->startx, endx = span->endx;
2961         float localcolor[4], ilerp, lerp;
2962         localcolor[0] = color[0];
2963         localcolor[1] = color[1];
2964         localcolor[2] = color[2];
2965         localcolor[3] = color[3];
2966         ilerp = 1.0f - localcolor[3];
2967         lerp = localcolor[3];
2968         for (x = startx;x < endx;x++)
2969         {
2970                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2971                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2972                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2973                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2974         }
2975 }
2976
2977
2978
2979 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2980 {
2981 #ifdef SSE_POSSIBLE
2982         int x;
2983         int startx = span->startx;
2984         int endx = span->endx;
2985         __m128 data, slope;
2986         __m128 mod, endmod;
2987         __m128i submod, substep, endsubmod;
2988         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2989         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2990         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2991         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2992         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2993         for (x = startx; x < endx;)
2994         {
2995                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2996                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2997                 if (nextsub >= endx)
2998                 {
2999                         nextsub = endsub = endx-1;
3000                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3001                 }
3002                 mod = endmod;
3003                 submod = endsubmod;
3004                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3005                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3006                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3007                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3008                 substep = _mm_packs_epi32(substep, substep);
3009                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3010                 {
3011                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3012                         pix = _mm_mulhi_epu16(pix, submod);
3013                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3014                 }
3015                 if (x <= endsub)
3016                 {
3017                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3018                         pix = _mm_mulhi_epu16(pix, submod);
3019                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3020                         x++;
3021                 }
3022         }
3023 #endif
3024 }
3025
3026 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3027 {
3028 #ifdef SSE_POSSIBLE
3029         int x;
3030         int startx = span->startx;
3031         int endx = span->endx;
3032         __m128 data, slope;
3033         __m128 mod, endmod;
3034         __m128i submod, substep, endsubmod;
3035         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3036         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3037         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3038         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3039         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3040         for (x = startx; x < endx;)
3041         {
3042                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3043                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3044                 if (nextsub >= endx)
3045                 {
3046                         nextsub = endsub = endx-1;
3047                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3048                 }
3049                 mod = endmod;
3050                 submod = endsubmod;
3051                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3052                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3053                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3054                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3055                 substep = _mm_packs_epi32(substep, substep);
3056                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3057                 {
3058                         __m128i pix = _mm_srai_epi16(submod, 4);
3059                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3060                 }
3061                 if (x <= endsub)
3062                 {
3063                         __m128i pix = _mm_srai_epi16(submod, 4);
3064                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3065                         x++;
3066                 }
3067         }
3068 #endif
3069 }
3070
3071 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3072 {
3073 #ifdef SSE_POSSIBLE
3074         int x, startx = span->startx, endx = span->endx;
3075         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3076         localcolor = _mm_packs_epi32(localcolor, localcolor);
3077         for (x = startx;x+2 <= endx;x+=2)
3078         {
3079                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3080                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3081                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3082                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3083         }
3084         if (x < endx)
3085         {
3086                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3087                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3088                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3089                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3090         }
3091 #endif
3092 }
3093
3094 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3095 {
3096 #ifdef SSE_POSSIBLE
3097         int x, startx = span->startx, endx = span->endx;
3098         for (x = startx;x+2 <= endx;x+=2)
3099         {
3100                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3101                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3102                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3103                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3104         }
3105         if (x < endx)
3106         {
3107                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3108                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3109                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3110                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3111         }
3112 #endif
3113 }
3114
3115 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3116 {
3117 #ifdef SSE_POSSIBLE
3118         int x, startx = span->startx, endx = span->endx;
3119         for (x = startx;x+2 <= endx;x+=2)
3120         {
3121                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3122                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3123                 pix1 = _mm_add_epi16(pix1, pix2);
3124                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3125         }
3126         if (x < endx)
3127         {
3128                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3129                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3130                 pix1 = _mm_add_epi16(pix1, pix2);
3131                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3132         }
3133 #endif
3134 }
3135
3136 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3137 {
3138 #ifdef SSE_POSSIBLE
3139         int x, startx = span->startx, endx = span->endx;
3140         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3141         tint = _mm_packs_epi32(tint, tint);
3142         for (x = startx;x+2 <= endx;x+=2)
3143         {
3144                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3145                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3146                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3147                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3148         }
3149         if (x < endx)
3150         {
3151                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3152                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3153                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3154                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3155         }
3156 #endif
3157 }
3158
3159 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3160 {
3161 #ifdef SSE_POSSIBLE
3162         int x, startx = span->startx, endx = span->endx;
3163         for (x = startx;x+2 <= endx;x+=2)
3164         {
3165                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3166                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3167                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3168                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3169                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3170         }
3171         if (x < endx)
3172         {
3173                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3174                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3175                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3176                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3177                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3178         }
3179 #endif
3180 }
3181
3182 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3183 {
3184 #ifdef SSE_POSSIBLE
3185         int x, startx = span->startx, endx = span->endx;
3186         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3187         localcolor = _mm_packs_epi32(localcolor, localcolor);
3188         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3189         for (x = startx;x+2 <= endx;x+=2)
3190         {
3191                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3192                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3193                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3194         }
3195         if (x < endx)
3196         {
3197                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3198                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3199                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3200         }
3201 #endif
3202 }
3203
3204
3205
3206 void DPSOFTRAST_VertexShader_Generic(void)
3207 {
3208         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3209         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3210         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3211         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3212                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3213 }
3214
3215 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3216 {
3217         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3218         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3219         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3220         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3221         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3222         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3223         {
3224                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3225                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3226                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3227                 {
3228                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3229                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3230                         {
3231                                 // multiply
3232                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3233                         }
3234                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3235                         {
3236                                 // add
3237                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3238                         }
3239                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3240                         {
3241                                 // alphablend
3242                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3243                         }
3244                 }
3245         }
3246         else
3247                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3248         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3249 }
3250
3251
3252
3253 void DPSOFTRAST_VertexShader_PostProcess(void)
3254 {
3255         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3256         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3257         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3258 }
3259
3260 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3261 {
3262         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3263         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3264         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3265         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3266         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3267         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3268         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3269         {
3270                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3271                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3272         }
3273         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3274         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3275         {
3276                 // TODO: implement saturation
3277         }
3278         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3279         {
3280                 // TODO: implement gammaramps
3281         }
3282         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3283 }
3284
3285
3286
3287 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3288 {
3289         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3290 }
3291
3292 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3293 {
3294         // this is never called (because colormask is off when this shader is used)
3295         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3296         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3297         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3298         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3299         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3300 }
3301
3302
3303
3304 void DPSOFTRAST_VertexShader_FlatColor(void)
3305 {
3306         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3307         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3308 }
3309
3310 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3311 {
3312 #ifdef SSE_POSSIBLE
3313         unsigned char * RESTRICT pixelmask = span->pixelmask;
3314         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3315         int x, startx = span->startx, endx = span->endx;
3316         __m128i Color_Ambientm;
3317         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3318         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3319         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3320         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3321         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3322         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3323                 pixel = buffer_FragColorbgra8;
3324         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3325         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3326         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3327         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3328         for (x = startx;x < endx;x++)
3329         {
3330                 __m128i color, pix;
3331                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3332                 {
3333                         __m128i pix2;
3334                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3335                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3336                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3337                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3338                         x += 3;
3339                         continue;
3340                 }
3341                 if (!pixelmask[x])
3342                         continue;
3343                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3344                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3345                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3346         }
3347         if (pixel == buffer_FragColorbgra8)
3348                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3349 #endif
3350 }
3351
3352
3353
3354 void DPSOFTRAST_VertexShader_VertexColor(void)
3355 {
3356         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3357         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3358         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3359 }
3360
3361 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3362 {
3363 #ifdef SSE_POSSIBLE
3364         unsigned char * RESTRICT pixelmask = span->pixelmask;
3365         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3366         int x, startx = span->startx, endx = span->endx;
3367         __m128i Color_Ambientm, Color_Diffusem;
3368         __m128 data, slope;
3369         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3370         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3371         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3372         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3373         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3374         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3375         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3376                 pixel = buffer_FragColorbgra8;
3377         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3378         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3379         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3380         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3381         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3382         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3383         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3384         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3385         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3386         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3387         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3388         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3389         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3390         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3391         {
3392                 __m128i color, mod, pix;
3393                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3394                 {
3395                         __m128i pix2, mod2;
3396                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3397                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3398                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3399                         data = _mm_add_ps(data, slope);
3400                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3401                         data = _mm_add_ps(data, slope);
3402                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3403                         data = _mm_add_ps(data, slope);
3404                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3405                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3406                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3407                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3408                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3409                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3410                         x += 3;
3411                         continue;
3412                 }
3413                 if (!pixelmask[x])
3414                         continue;
3415                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3416                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3417                 mod = _mm_packs_epi32(mod, mod);
3418                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3419                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3420         }
3421         if (pixel == buffer_FragColorbgra8)
3422                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3423 #endif
3424 }
3425
3426
3427
3428 void DPSOFTRAST_VertexShader_Lightmap(void)
3429 {
3430         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3431         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3432         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3433 }
3434
3435 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3436 {
3437 #ifdef SSE_POSSIBLE
3438         unsigned char * RESTRICT pixelmask = span->pixelmask;
3439         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3440         int x, startx = span->startx, endx = span->endx;
3441         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3442         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3443         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3444         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3445         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3446         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3447         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3448         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3449         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3450         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3451                 pixel = buffer_FragColorbgra8;
3452         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3453         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3454         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3455         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3456         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3457         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3458         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3459         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3460         {
3461                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3462                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3463                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3464                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3465                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3466                 for (x = startx;x < endx;x++)
3467                 {
3468                         __m128i color, lightmap, glow, pix;
3469                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3470                         {
3471                                 __m128i pix2;
3472                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3473                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3474                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3475                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3476                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3477                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3478                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3479                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3480                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3481                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3482                                 x += 3;
3483                                 continue;
3484                         }
3485                         if (!pixelmask[x])
3486                                 continue;
3487                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3488                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3489                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3490                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3491                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3492                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3493                 }
3494         }
3495         else
3496         {
3497                 for (x = startx;x < endx;x++)
3498                 {
3499                         __m128i color, lightmap, pix;
3500                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3501                         {
3502                                 __m128i pix2;
3503                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3504                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3505                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3506                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3507                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3508                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3509                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3510                                 x += 3;
3511                                 continue;
3512                         }
3513                         if (!pixelmask[x]) 
3514                                 continue;
3515                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3516                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3517                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3518                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3519                 }
3520         }
3521         if (pixel == buffer_FragColorbgra8)
3522                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3523 #endif
3524 }
3525
3526
3527 void DPSOFTRAST_VertexShader_LightDirection(void);
3528 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3529
// FakeLight vertex shader: has no work of its own and simply runs the
// LightDirection vertex shader.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3534
3535 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3536 {
3537         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3538 }
3539
3540
3541
3542 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3543 {
3544         DPSOFTRAST_VertexShader_LightDirection();
3545         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3546 }
3547
3548 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3549 {
3550         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3551 }
3552
3553
3554
3555 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3556 {
3557         DPSOFTRAST_VertexShader_LightDirection();
3558         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3559 }
3560
3561 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3562 {
3563         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3564 }
3565
3566
3567
3568 void DPSOFTRAST_VertexShader_LightDirection(void)
3569 {
3570         int i;
3571         int numvertices = dpsoftrast.numvertices;
3572         float LightDir[4];
3573         float LightVector[4];
3574         float EyePosition[4];
3575         float EyeVectorModelSpace[4];
3576         float EyeVector[4];
3577         float position[4];
3578         float svector[4];
3579         float tvector[4];
3580         float normal[4];
3581         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3582         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3583         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3584         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3585         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3586         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3587         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3588         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3589         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3590         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3591         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3592         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3593         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3594         for (i = 0;i < numvertices;i++)
3595         {
3596                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3597                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3598                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3599                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3600                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3601                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3602                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3603                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3604                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3605                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3606                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3607                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3608                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3609                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3610                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3611                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3612                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3613                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3614                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3615                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3616                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3617                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3618                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3619                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3620                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3621                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3622                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3623                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3624                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3625         }
3626         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3627 }
3628
3629 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3630 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3631 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3632 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3633 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3634 #define DPSOFTRAST_Vector3Normalize(v)\
3635 do\
3636 {\
3637         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3638         if (len)\
3639         {\
3640                 len = 1.0f / len;\
3641                 v[0] *= len;\
3642                 v[1] *= len;\
3643                 v[2] *= len;\
3644         }\
3645 }\
3646 while(0)
3647
3648 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3649 {
3650         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3651         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3652         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3653         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3654         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3655         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3656         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3657         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3658         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3659         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3660         int x, startx = span->startx, endx = span->endx;
3661         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3662         float LightVectordata[4];
3663         float LightVectorslope[4];
3664         float EyeVectordata[4];
3665         float EyeVectorslope[4];
3666         float VectorSdata[4];
3667         float VectorSslope[4];
3668         float VectorTdata[4];
3669         float VectorTslope[4];
3670         float VectorRdata[4];
3671         float VectorRslope[4];
3672         float z;
3673         float diffusetex[4];
3674         float glosstex[4];
3675         float surfacenormal[4];
3676         float lightnormal[4];
3677         float lightnormal_modelspace[4];
3678         float eyenormal[4];
3679         float specularnormal[4];
3680         float diffuse;
3681         float specular;
3682         float SpecularPower;
3683         int d[4];
3684         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3685         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3686         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3687         Color_Glow[3] = 0.0f;
3688         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3689         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3690         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3691         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3692         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3693         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3694         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3695         Color_Pants[3] = 0.0f;
3696         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3697         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3698         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3699         Color_Shirt[3] = 0.0f;
3700         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3701         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3702         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3703         {
3704                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3705                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3706         }
3707         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3708         {
3709                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3710         }
3711         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3712         {
3713                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3714                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3715                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3716                 Color_Diffuse[3] = 0.0f;
3717                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3718                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3719                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3720                 LightColor[3] = 0.0f;
3721                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3722                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3723                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3724                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3725                 Color_Specular[3] = 0.0f;
3726                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3727                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3728                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3729
3730                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3731                 {
3732                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3733                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3734                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3735                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3736                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3737                 }
3738                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3739                 {
3740                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3741                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3742                 }
3743                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3744                 {
3745                         // nothing of this needed
3746                 }
3747                 else
3748                 {
3749                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3750                 }
3751
3752                 for (x = startx;x < endx;x++)
3753                 {
3754                         z = buffer_z[x];
3755                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3756                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3757                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3758                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3759                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3760                         {
3761                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3762                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3763                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3764                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3765                         }
3766                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3767                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3768                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3769                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3770                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3771                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3772                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3773                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3774
3775                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3776                         {
3777                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3778                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3779                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3780                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3781
3782                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3783                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3784                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3785                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3786
3787                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3788                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3789                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3790                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3791
3792                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3793                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3794                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3795                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3796
3797                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3798                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3799
3800                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3801                                 {
3802                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3803                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3804                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3805                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3806                                 }
3807                         }
3808                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3809                         {
3810                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3811                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3812                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3813                                 {
3814                                         float f = 1.0f / 256.0f;
3815                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3816                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3817                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3818                                 }
3819                         }
3820                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3821                         {
3822                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3823                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3824                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3825                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3826
3827                                 LightColor[0] = 1.0;
3828                                 LightColor[1] = 1.0;
3829                                 LightColor[2] = 1.0;
3830                         }
3831                         else
3832                         {
3833                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3834                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3835                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3836                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3837                         }
3838
3839                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3840
3841                         if(thread->shader_exactspecularmath)
3842                         {
3843                                 // reflect lightnormal at surfacenormal, take the negative of that
3844                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3845                                 float f;
3846                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3847                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3848                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3849                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3850
3851                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3852                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3853                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3854                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3855                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3856
3857                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3858                         }
3859                         else
3860                         {
3861                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3862                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3863                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3864                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3865
3866                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3867                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3868                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3869                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3870
3871                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3872                         }
3873
3874                         specular = pow(specular, SpecularPower * glosstex[3]);
3875                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3876                         {
3877                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3878                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3879                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3880                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3881                         }
3882                         else
3883                         {
3884                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3885                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3886                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3887                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3888                         }
3889
3890                         buffer_FragColorbgra8[x*4+0] = d[0];
3891                         buffer_FragColorbgra8[x*4+1] = d[1];
3892                         buffer_FragColorbgra8[x*4+2] = d[2];
3893                         buffer_FragColorbgra8[x*4+3] = d[3];
3894                 }
3895         }
3896         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3897         {
3898                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3899                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3900                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3901                 Color_Diffuse[3] = 0.0f;
3902                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3903                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3904                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3905                 LightColor[3] = 0.0f;
3906                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3907
3908                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3909                 {
3910                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3911                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3912                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3913                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3914                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3915                 }
3916                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3917                 {
3918                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3919                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3920                 }
3921                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3922                 {
3923                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3924                 }
3925                 else
3926                 {
3927                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3928                 }
3929
3930                 for (x = startx;x < endx;x++)
3931                 {
3932                         z = buffer_z[x];
3933                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3934                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3935                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3936                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3937                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3938                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3939                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3940                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3941
3942                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3943                         {
3944                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3945                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3946                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3947                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3948
3949                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3950                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3951                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3952                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3953
3954                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3955                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3956                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3957                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3958
3959                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3960                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3961                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3962                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3963
3964                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3965                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3966
3967                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3968                                 {
3969                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3970                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3971                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3972                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3973                                 }
3974                         }
3975                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3976                         {
3977                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3978                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3979                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3980                                 {
3981                                         float f = 1.0f / 256.0f;
3982                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3983                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3984                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3985                                 }
3986                         }
3987                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3988                         {
3989                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3990                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3991                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3992                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3993
3994                                 LightColor[0] = 1.0;
3995                                 LightColor[1] = 1.0;
3996                                 LightColor[2] = 1.0;
3997                         }
3998                         else
3999                         {
4000                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4001                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4002                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4003                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4004                         }
4005
4006                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4007                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4008                         {
4009                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4010                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4011                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4012                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4013                         }
4014                         else
4015                         {
4016                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4017                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4018                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4019                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4020                         }
4021                         buffer_FragColorbgra8[x*4+0] = d[0];
4022                         buffer_FragColorbgra8[x*4+1] = d[1];
4023                         buffer_FragColorbgra8[x*4+2] = d[2];
4024                         buffer_FragColorbgra8[x*4+3] = d[3];
4025                 }
4026         }
4027         else
4028         {
4029                 for (x = startx;x < endx;x++)
4030                 {
4031                         z = buffer_z[x];
4032                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4033                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4034                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4035                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4036
4037                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4038                         {
4039                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4040                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4041                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4042                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4043                         }
4044                         else
4045                         {
4046                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4047                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4048                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4049                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4050                         }
4051                         buffer_FragColorbgra8[x*4+0] = d[0];
4052                         buffer_FragColorbgra8[x*4+1] = d[1];
4053                         buffer_FragColorbgra8[x*4+2] = d[2];
4054                         buffer_FragColorbgra8[x*4+3] = d[3];
4055                 }
4056         }
4057         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4058 }
4059
4060
4061
4062 void DPSOFTRAST_VertexShader_LightSource(void)
4063 {
4064         int i;
4065         int numvertices = dpsoftrast.numvertices;
4066         float LightPosition[4];
4067         float LightVector[4];
4068         float LightVectorModelSpace[4];
4069         float EyePosition[4];
4070         float EyeVectorModelSpace[4];
4071         float EyeVector[4];
4072         float position[4];
4073         float svector[4];
4074         float tvector[4];
4075         float normal[4];
4076         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4077         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4078         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4079         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4080         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4081         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4082         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4083         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4084         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4085         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4086         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4087         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4088         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4089         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4090         for (i = 0;i < numvertices;i++)
4091         {
4092                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4093                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4094                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4095                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4096                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4097                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4098                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4099                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4100                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4101                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4102                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4103                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4104                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4105                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4106                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4107                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4108                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4109                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4110                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4111                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4112                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4113                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4114                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4115                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4116                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4117                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4118                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4119                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4120                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4121                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4122                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4123                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4124         }
4125         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4126         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4127 }
4128
4129 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4130 {
4131 #ifdef SSE_POSSIBLE
4132         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4133         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4134         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4135         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4136         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4137         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4138         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4139         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4140         int x, startx = span->startx, endx = span->endx;
4141         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4142         float CubeVectordata[4];
4143         float CubeVectorslope[4];
4144         float LightVectordata[4];
4145         float LightVectorslope[4];
4146         float EyeVectordata[4];
4147         float EyeVectorslope[4];
4148         float z;
4149         float diffusetex[4];
4150         float glosstex[4];
4151         float surfacenormal[4];
4152         float lightnormal[4];
4153         float eyenormal[4];
4154         float specularnormal[4];
4155         float diffuse;
4156         float specular;
4157         float SpecularPower;
4158         float CubeVector[4];
4159         float attenuation;
4160         int d[4];
4161         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4162         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4163         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4164         Color_Glow[3] = 0.0f;
4165         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4166         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4167         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4168         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4169         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4170         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4171         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4172         Color_Diffuse[3] = 0.0f;
4173         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4174         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4175         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4176         Color_Specular[3] = 0.0f;
4177         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4178         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4179         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4180         Color_Pants[3] = 0.0f;
4181         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4182         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4183         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4184         Color_Shirt[3] = 0.0f;
4185         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4186         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4187         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4188         LightColor[3] = 0.0f;
4189         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4190         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4191         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4192         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4193         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4194         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4195         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4196         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4197         {
4198                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4199                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4200         }
4201         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4202                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4203         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4204         {
4205                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4206                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4207                 for (x = startx;x < endx;x++)
4208                 {
4209                         z = buffer_z[x];
4210                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4211                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4212                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4213                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4214                         if (attenuation < 0.01f)
4215                                 continue;
4216                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4217                         {
4218                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4219                                 if (attenuation < 0.01f)
4220                                         continue;
4221                         }
4222
4223                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4224                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4225                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4226                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4227                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4228                         {
4229                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4230                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4231                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4232                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4233                         }
4234                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4235                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4236                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4237                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4238                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4239                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4240                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4241                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4242
4243                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4244                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4245                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4246                         DPSOFTRAST_Vector3Normalize(lightnormal);
4247
4248                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4249
4250                         if(thread->shader_exactspecularmath)
4251                         {
4252                                 // reflect lightnormal at surfacenormal, take the negative of that
4253                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4254                                 float f;
4255                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4256                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4257                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4258                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4259
4260                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4261                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4262                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4263                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4264                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4265
4266                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4267                         }
4268                         else
4269                         {
4270                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4271                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4272                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4273                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4274
4275                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4276                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4277                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4278                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4279
4280                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4281                         }
4282                         specular = pow(specular, SpecularPower * glosstex[3]);
4283
4284                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4285                         {
4286                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4287                                 attenuation *= (1.0f / 255.0f);
4288                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4289                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4290                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4291                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4292                         }
4293                         else
4294                         {
4295                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4296                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4297                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4298                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4299                         }
4300                         buffer_FragColorbgra8[x*4+0] = d[0];
4301                         buffer_FragColorbgra8[x*4+1] = d[1];
4302                         buffer_FragColorbgra8[x*4+2] = d[2];
4303                         buffer_FragColorbgra8[x*4+3] = d[3];
4304                 }
4305         }
4306         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4307         {
4308                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4309                 for (x = startx;x < endx;x++)
4310                 {
4311                         z = buffer_z[x];
4312                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4313                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4314                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4315                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4316                         if (attenuation < 0.01f)
4317                                 continue;
4318                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4319                         {
4320                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4321                                 if (attenuation < 0.01f)
4322                                         continue;
4323                         }
4324
4325                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4326                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4327                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4328                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4329                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4330                         {
4331                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4332                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4333                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4334                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4335                         }
4336                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4337                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4338                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4339                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4340
4341                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4342                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4343                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4344                         DPSOFTRAST_Vector3Normalize(lightnormal);
4345
4346                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4347                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4348                         {
4349                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4350                                 attenuation *= (1.0f / 255.0f);
4351                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4352                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4353                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4354                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4355                         }
4356                         else
4357                         {
4358                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4359                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4360                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4361                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4362                         }
4363                         buffer_FragColorbgra8[x*4+0] = d[0];
4364                         buffer_FragColorbgra8[x*4+1] = d[1];
4365                         buffer_FragColorbgra8[x*4+2] = d[2];
4366                         buffer_FragColorbgra8[x*4+3] = d[3];
4367                 }
4368         }
4369         else
4370         {
4371                 for (x = startx;x < endx;x++)
4372                 {
4373                         z = buffer_z[x];
4374                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4375                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4376                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4377                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4378                         if (attenuation < 0.01f)
4379                                 continue;
4380                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4381                         {
4382                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4383                                 if (attenuation < 0.01f)
4384                                         continue;
4385                         }
4386
4387                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4388                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4389                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4390                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4391                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4392                         {
4393                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4394                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4395                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4396                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4397                         }
4398                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4399                         {
4400                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4401                                 attenuation *= (1.0f / 255.0f);
4402                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4403                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4404                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4405                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4406                         }
4407                         else
4408                         {
4409                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4410                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4411                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4412                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4413                         }
4414                         buffer_FragColorbgra8[x*4+0] = d[0];
4415                         buffer_FragColorbgra8[x*4+1] = d[1];
4416                         buffer_FragColorbgra8[x*4+2] = d[2];
4417                         buffer_FragColorbgra8[x*4+3] = d[3];
4418                 }
4419         }
4420         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4421 #endif
4422 }
4423
4424
4425
4426 void DPSOFTRAST_VertexShader_Refraction(void)
4427 {
4428         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4429         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4430         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4431 }
4432
4433 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4434 {
4435         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4436         float z;
4437         int x, startx = span->startx, endx = span->endx;
4438
4439         // texture reads
4440         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4441         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4442
4443         // varyings
4444         float ModelViewProjectionPositiondata[4];
4445         float ModelViewProjectionPositionslope[4];
4446
4447         // uniforms
4448         float ScreenScaleRefractReflect[2];
4449         float ScreenCenterRefractReflect[2];
4450         float DistortScaleRefractReflect[2];
4451         float RefractColor[4];
4452
4453         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4454         if(!texture) return;
4455
4456         // read textures
4457         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4458         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4459
4460         // read varyings
4461         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4462
4463         // read uniforms
4464         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4465         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4466         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4467         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4468         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4469         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4470         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4471         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4472         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4473         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4474
4475         // do stuff
4476         for (x = startx;x < endx;x++)
4477         {
4478                 float SafeScreenTexCoord[2];
4479                 float ScreenTexCoord[2];
4480                 float v[3];
4481                 float iw;
4482                 unsigned char c[4];
4483
4484                 z = buffer_z[x];
4485
4486                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4487                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4488
4489                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4490                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4491                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4492
4493                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4494                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4495                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4496                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4497                 DPSOFTRAST_Vector3Normalize(v);
4498                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4499                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4500
4501                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4502                 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4503
4504                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4505                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4506                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4507                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4508         }
4509
4510         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4511 }
4512
4513
4514
4515 void DPSOFTRAST_VertexShader_Water(void)
4516 {
4517         int i;
4518         int numvertices = dpsoftrast.numvertices;
4519         float EyePosition[4];
4520         float EyeVectorModelSpace[4];
4521         float EyeVector[4];
4522         float position[4];
4523         float svector[4];
4524         float tvector[4];
4525         float normal[4];
4526         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4527         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4528         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4529         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4530         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4531         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4532         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4533         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4534         for (i = 0;i < numvertices;i++)
4535         {
4536                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4537                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4538                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4539                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4540                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4541                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4542                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4543                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4544                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4545                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4546                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4547                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4548                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4549                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4550                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4551                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4552                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4553                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4554                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4555                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4556                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4557                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4558         }
4559         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4560         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4561         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4562 }
4563
4564
4565 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4566 {
4567         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4568         float z;
4569         int x, startx = span->startx, endx = span->endx;
4570
4571         // texture reads
4572         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4573         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4574
4575         // varyings
4576         float ModelViewProjectionPositiondata[4];
4577         float ModelViewProjectionPositionslope[4];
4578         float EyeVectordata[4];
4579         float EyeVectorslope[4];
4580
4581         // uniforms
4582         float ScreenScaleRefractReflect[2];
4583         float ScreenCenterRefractReflect[2];
4584         float DistortScaleRefractReflect[2];
4585         float RefractColor[4];
4586         float ReflectColor[4];
4587         float ReflectFactor;
4588         float ReflectOffset;
4589
4590         DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4591         DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4592         if(!texture_refraction || !texture_reflection) return;
4593
4594         // read textures
4595         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4596         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4597
4598         // read varyings
4599         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4600         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4601
4602         // read uniforms
4603         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4604         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4605         ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4606         ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4607         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4608         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4609         ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4610         ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4611         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4612         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4613         DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4614         DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4615         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4616         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4617         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4618         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4619         ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4620         ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4621         ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4622         ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4623         ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4624         ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4625
4626         // do stuff
4627         for (x = startx;x < endx;x++)
4628         {
4629                 float SafeScreenTexCoord[4];
4630                 float ScreenTexCoord[4];
4631                 float v[3];
4632                 float iw;
4633                 unsigned char c1[4];
4634                 unsigned char c2[4];
4635                 float Fresnel;
4636
4637                 z = buffer_z[x];
4638
4639                 // "    vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4640                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4641
4642                 // "    vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4643                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4644                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4645                 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4646                 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4647
4648                 // "    vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4649                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4650                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4651                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4652                 DPSOFTRAST_Vector3Normalize(v);
4653                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4654                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4655                 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4656                 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4657
4658                 // "    float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4659                 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4660                 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4661                 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4662                 DPSOFTRAST_Vector3Normalize(v);
4663                 Fresnel = 1.0f - v[2];
4664                 Fresnel = min(1.0f, Fresnel);
4665                 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4666
4667                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4668                 // "    dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4669                 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4670                 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4671
4672                 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4673                 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4674                 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4675                 buffer_FragColorbgra8[x*4+3] = min((    RefractColor[3] *  (1.0f - Fresnel) +          ReflectColor[3]  * Fresnel) * 256, 255);
4676         }
4677
4678         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4679 }
4680
4681
4682
4683 void DPSOFTRAST_VertexShader_ShowDepth(void)
4684 {
4685         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4686 }
4687
4688 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4689 {
4690         // TODO: IMPLEMENT
4691         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4692         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4693         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4694         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4695         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4696 }
4697
4698
4699
4700 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4701 {
4702         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4703 }
4704
4705 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4706 {
4707         // TODO: IMPLEMENT
4708         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4709         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4710         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4711         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4712         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4713 }
4714
4715
4716
4717 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4718 {
4719         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4720 }
4721
4722 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4723 {
4724         // TODO: IMPLEMENT
4725         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4726         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4727         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4728         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4729         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4730 }
4731
4732
4733
4734 typedef struct DPSOFTRAST_ShaderModeInfo_s
4735 {
4736         int lodarrayindex;
4737         void (*Vertex)(void);
4738         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4739         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4740         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4741 }
4742 DPSOFTRAST_ShaderModeInfo;
4743
4744 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4745 {
4746         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4747         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4748         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4749         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4750         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4751         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4752         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4753         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4754         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4755         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4756         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4757         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4758         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4759         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4760         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4761         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4762 };
4763
4764 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4765 {
4766         int x;
4767         int startx;
4768         int endx;
4769         unsigned int *depthpixel;
4770         int depth;
4771         int depthslope;
4772         unsigned int d;
4773         unsigned char *pixelmask;
4774         DPSOFTRAST_State_Triangle *triangle;
4775         triangle = &thread->triangles[span->triangle];
4776         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4777         startx = span->startx;
4778         endx = span->endx;
4779         depth = span->depthbase;
4780         depthslope = span->depthslope;
4781         pixelmask = thread->pixelmaskarray;
4782         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4783         {
4784                 switch(thread->fb_depthfunc)
4785                 {
4786                 default:
4787                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4788                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4789                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4790                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4791                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4792                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4793                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4794                 }
4795                 while (startx < endx && !pixelmask[startx])
4796                         startx++;
4797                 while (endx > startx && !pixelmask[endx-1])
4798                         endx--;
4799         }
4800         else
4801         {
4802                 // no depth testing means we're just dealing with color...
4803                 memset(pixelmask + startx, 1, endx - startx);
4804         }
4805         span->pixelmask = pixelmask;
4806         span->startx = startx;
4807         span->endx = endx;
4808 }
4809
4810 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4811 {
4812         int x, d, depth, depthslope, startx, endx;
4813         const unsigned char *pixelmask;
4814         unsigned int *depthpixel;
4815         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4816         {
4817                 depth = span->depthbase;
4818                 depthslope = span->depthslope;
4819                 pixelmask = span->pixelmask;
4820                 startx = span->startx;
4821                 endx = span->endx;
4822                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4823                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4824                         if (pixelmask[x])
4825                                 depthpixel[x] = d;
4826         }
4827 }
4828
4829 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4830 {
4831         int i;
4832         DPSOFTRAST_State_Triangle *triangle;
4833         DPSOFTRAST_State_Span *span;
4834         for (i = 0; i < thread->numspans; i++)
4835         {
4836                 span = &thread->spans[i];
4837                 triangle = &thread->triangles[span->triangle];
4838                 DPSOFTRAST_Draw_DepthTest(thread, span);
4839                 if (span->startx >= span->endx)
4840                         continue;
4841                 // run pixel shader if appropriate
4842                 // do this before running depthmask code, to allow the pixelshader
4843                 // to clear pixelmask values for alpha testing
4844                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4845                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4846                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4847         }
4848         thread->numspans = 0;
4849 }
4850
4851 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4852
4853 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4854 {
4855 #ifdef SSE_POSSIBLE
4856         int cullface = thread->cullface;
4857         int minx, maxx, miny, maxy;
4858         int miny1, maxy1, miny2, maxy2;
4859         __m128i fbmin, fbmax;
4860         __m128 viewportcenter, viewportscale;
4861         int firstvertex = command->firstvertex;
4862         int numvertices = command->numvertices;
4863         int numtriangles = command->numtriangles;
4864         const int *element3i = command->element3i;
4865         const unsigned short *element3s = command->element3s;
4866         int clipped = command->clipped;
4867         int i;
4868         int j;
4869         int k;
4870         int y;
4871         int e[3];
4872         __m128i screeny;
4873         int starty, endy, bandy;
4874         int numpoints;
4875         int clipcase;
4876         float clipdist[4];
4877         float clip0origin, clip0slope;
4878         int clip0dir;
4879         __m128 triangleedge1, triangleedge2, trianglenormal;
4880         __m128 clipfrac[3];
4881         __m128 screen[4];
4882         DPSOFTRAST_State_Triangle *triangle;
4883         DPSOFTRAST_Texture *texture;
4884         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4885         miny = thread->fb_scissor[1];
4886         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4887         miny1 = bound(miny, thread->miny1, maxy);
4888         maxy1 = bound(miny, thread->maxy1, maxy);
4889         miny2 = bound(miny, thread->miny2, maxy);
4890         maxy2 = bound(miny, thread->maxy2, maxy);
4891         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4892         {
4893                 if (!ATOMIC_DECREMENT(command->refcount))
4894                 {
4895                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4896                                 MM_FREE(command->arrays);
4897                 }
4898                 return;
4899         }
4900         minx = thread->fb_scissor[0];
4901         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4902         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4903         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4904         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4905         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4906         screen[3] = _mm_setzero_ps();
4907         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4908         for (i = 0;i < numtriangles;i++)
4909         {
4910                 const float *screencoord4f = command->arrays;
4911                 const float *arrays = screencoord4f + numvertices*4;
4912
4913                 // generate the 3 edges of this triangle
4914                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4915                 if (element3s)
4916                 {
4917                         e[0] = element3s[i*3+0] - firstvertex;
4918                         e[1] = element3s[i*3+1] - firstvertex;
4919                         e[2] = element3s[i*3+2] - firstvertex;
4920                 }
4921                 else if (element3i)
4922                 {
4923                         e[0] = element3i[i*3+0] - firstvertex;
4924                         e[1] = element3i[i*3+1] - firstvertex;
4925                         e[2] = element3i[i*3+2] - firstvertex;
4926                 }
4927                 else
4928                 {
4929                         e[0] = i*3+0;
4930                         e[1] = i*3+1;
4931                         e[2] = i*3+2;
4932                 }
4933
4934 #define SKIPBACKFACE \
4935                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4936                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4937                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4938                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4939                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4940                 switch(cullface) \
4941                 { \
4942                 case GL_BACK: \
4943                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4944                                 continue; \
4945                         break; \
4946                 case GL_FRONT: \
4947                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4948                                 continue; \
4949                         break; \
4950                 }
4951
4952 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4953                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4954                         { \
4955                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4956                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4957                         }
4958 #define CLIPPEDVERTEXCOPY(k,p1) \
4959                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4960
4961 #define GENATTRIBCOPY(attrib, p1) \
4962                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4963 #define GENATTRIBLERP(attrib, p1, p2) \
4964                 { \
4965                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4966                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4967                 }
4968 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4969                 switch(clipcase) \
4970                 { \
4971                 default: \
4972                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4973                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4974                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4975                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4976                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4977                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4978                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4979                 }
4980
4981                 if (! clipped)
4982                         goto notclipped;
4983
4984                 // calculate distance from nearplane
4985                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4986                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4987                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4988                 if (clipdist[0] >= 0.0f)
4989                 {
4990                         if (clipdist[1] >= 0.0f)
4991                         {
4992                                 if (clipdist[2] >= 0.0f)
4993                                 {
4994                                 notclipped:
4995                                         // triangle is entirely in front of nearplane
4996                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4997                                         SKIPBACKFACE;
4998                                         numpoints = 3;
4999                                         clipcase = 0;
5000                                 }
5001                                 else
5002                                 {
5003                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5004                                         SKIPBACKFACE;
5005                                         numpoints = 4;
5006                                         clipcase = 1;
5007                                 }
5008                         }
5009                         else
5010                         {
5011                                 if (clipdist[2] >= 0.0f)
5012                                 {
5013                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5014                                         SKIPBACKFACE;
5015                                         numpoints = 4;
5016                                         clipcase = 2;
5017                                 }
5018                                 else
5019                                 {
5020                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5021                                         SKIPBACKFACE;
5022                                         numpoints = 3;
5023                                         clipcase = 3;
5024                                 }
5025                         }
5026                 }
5027                 else if (clipdist[1] >= 0.0f)
5028                 {
5029                         if (clipdist[2] >= 0.0f)
5030                         {
5031                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5032                                 SKIPBACKFACE;
5033                                 numpoints = 4;
5034                                 clipcase = 4;
5035                         }
5036                         else
5037                         {
5038                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5039                                 SKIPBACKFACE;
5040                                 numpoints = 3;
5041                                 clipcase = 5;
5042                         }
5043                 }
5044                 else if (clipdist[2] >= 0.0f)
5045                 {
5046                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5047                         SKIPBACKFACE;
5048                         numpoints = 3;
5049                         clipcase = 6;
5050                 }
5051                 else continue; // triangle is entirely behind nearplane
5052
5053                 {
5054                         // calculate integer y coords for triangle points
5055                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5056                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5057                                         screenmin = _mm_min_epi16(screeni, screenir),
5058                                         screenmax = _mm_max_epi16(screeni, screenir);
5059                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5060                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5061                         screenmin = _mm_max_epi16(screenmin, fbmin);
5062                         screenmax = _mm_min_epi16(screenmax, fbmax);
5063                         // skip offscreen triangles
5064                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5065                                 continue;
5066                         starty = _mm_extract_epi16(screenmin, 1);
5067                         endy = _mm_extract_epi16(screenmax, 1)+1;
5068                         if (starty >= maxy1 && endy <= miny2)
5069                                 continue;
5070                         screeny = _mm_srai_epi32(screeni, 16);
5071                 }
5072
5073                 triangle = &thread->triangles[thread->numtriangles];
5074
5075                 // calculate attribute plans for triangle data...
5076                 // okay, this triangle is going to produce spans, we'd better project
5077                 // the interpolants now (this is what gives perspective texturing),
5078                 // this consists of simply multiplying all arrays by the W coord
5079                 // (which is basically 1/Z), which will be undone per-pixel
5080                 // (multiplying by Z again) to get the perspective-correct array
5081                 // values
5082                 {
5083                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5084                         __m128 mipedgescale, mipdensity;
5085                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5086                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5087                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5088                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5089                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5090                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5091                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5092                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5093                         attribedge1 = _mm_sub_ss(w0, w1);
5094                         attribedge2 = _mm_sub_ss(w2, w1);
5095                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5096                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5097                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5098                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5099                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5100                         _mm_store_ss(&triangle->w[0], attribxslope);
5101                         _mm_store_ss(&triangle->w[1], attribyslope);
5102                         _mm_store_ss(&triangle->w[2], attriborigin);
5103                         
5104                         clip0origin = 0;
5105                         clip0slope = 0;
5106                         clip0dir = 0;
5107                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5108                         {
5109                                 float cliporigin, clipxslope, clipyslope;
5110                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5111                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5112                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5113                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5114                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5115                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5116                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5117                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5118                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5119                                 if(clipxslope != 0)
5120                                 {
5121                                         clip0origin = -cliporigin/clipxslope;
5122                                         clip0slope = -clipyslope/clipxslope;
5123                                         clip0dir = clipxslope > 0 ? 1 : -1;
5124                                 }
5125                                 else if(clipyslope > 0)
5126                                 {
5127                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5128                                         clip0slope = dpsoftrast.fb_width;
5129                                         clip0dir = -1;
5130                                 }
5131                                 else if(clipyslope < 0)
5132                                 {
5133                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5134                                         clip0slope = -dpsoftrast.fb_width;
5135                                         clip0dir = -1;
5136                                 }
5137                                 else if(clip0origin < 0) continue;
5138                         }
5139
5140                         mipedgescale = _mm_setzero_ps();
5141                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5142                         {
5143                                 __m128 attrib0, attrib1, attrib2;
5144                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5145                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5146                                         break;
5147                                 arrays += numvertices*4;
5148                                 GENATTRIBS(attrib0, attrib1, attrib2);
5149                                 attriborigin = _mm_mul_ps(attrib1, w1);
5150                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5151                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5152                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5153                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5154                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5155                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5156                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5157                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5158                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5159                                 {
5160                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5161                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5162                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5163                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5164                                 }
5165                         }
5166
5167                         memset(triangle->mip, 0, sizeof(triangle->mip));
5168                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5169                         {
5170                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5171                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5172                                         break;
5173                                 texture = thread->texbound[texunit];
5174                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5175                                 {
5176                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5177                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5178                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5179                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5180                                         // this will be multiplied in the texturing routine by the texture resolution
5181                                         y = _mm_cvtss_si32(mipdensity);
5182                                         if (y > 0)
5183                                         {
5184                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5185                                                 if (y > texture->mipmaps - 1)
5186                                                         y = texture->mipmaps - 1;
5187                                                 triangle->mip[texunit] = y;
5188                                         }
5189                                 }
5190                         }
5191                 }
5192         
5193                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5194                 for (; y < bandy;)
5195                 {
5196                         __m128 xcoords, xslope;
5197                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5198                         int yccmask = _mm_movemask_epi8(ycc);
5199                         int edge0p, edge0n, edge1p, edge1n;
5200                         int nexty;
5201                         float w, wslope;
5202                         float clip0;
5203                         if (numpoints == 4)
5204                         {
5205                                 switch(yccmask)
5206                                 {
5207                                 default:
5208                                 case 0xFFFF: /*0000*/ y = endy; continue;
5209                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5210                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5211                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5212                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5213                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5214                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5215                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5216                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5217                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5218                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5219                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5220                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5221                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5222                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5223                                 case 0x0000: /*1111*/ y++; continue;
5224                                 }
5225                         }
5226                         else
5227                         {
5228                                 switch(yccmask)
5229                                 {
5230                                 default:
5231                                 case 0xFFFF: /*000*/ y = endy; continue;
5232                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5233                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5234                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5235                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5236                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5237                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5238                                 case 0x0000: /*111*/ y++; continue;
5239                                 }
5240                         }
5241                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5242                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5243                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5244                         nexty = _mm_extract_epi16(ycc, 0);
5245                         if (nexty >= bandy) nexty = bandy-1;
5246                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5247                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5248                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5249                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5250                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5251                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5252                         {
5253                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5254                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5255                         }
5256                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5257                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5258                         {
5259                                 int startx, endx, offset;
5260                                 startx = _mm_cvtss_si32(xcoords);
5261                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5262                                 if (startx < minx) startx = minx;
5263                                 if (endx > maxx) endx = maxx;
5264                                 if (startx >= endx) continue;
5265
5266                                 if (clip0dir)
5267                                 {
5268                                         if (clip0dir > 0)
5269                                         {
5270                                                 if (startx < clip0) 
5271                                                 {
5272                                                         if(endx <= clip0) continue;
5273                                                         startx = (int)clip0;
5274                                                 }
5275                                         }
5276                                         else if (endx > clip0) 
5277                                         {
5278                                                 if(startx >= clip0) continue;
5279                                                 endx = (int)clip0;
5280                                         }
5281                                 }
5282                                                 
5283                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5284                                 {
5285                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5286                                         span->triangle = thread->numtriangles;
5287                                         span->x = offset;
5288                                         span->y = y;
5289                                         span->startx = 0;
5290                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5291                                         if (span->startx >= span->endx)
5292                                                 continue;
5293                                         wslope = triangle->w[0];
5294                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5295                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5296                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5297                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5298                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5299                                 }
5300                         }
5301                 }
5302
5303                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5304                 {
5305                         DPSOFTRAST_Draw_ProcessSpans(thread);
5306                         thread->numtriangles = 0;
5307                 }
5308         }
5309
5310         if (!ATOMIC_DECREMENT(command->refcount))
5311         {
5312                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5313                         MM_FREE(command->arrays);
5314         }
5315
5316         if (thread->numspans > 0 || thread->numtriangles > 0)
5317         {
5318                 DPSOFTRAST_Draw_ProcessSpans(thread);
5319                 thread->numtriangles = 0;
5320         }
5321 #endif
5322 }
5323
5324 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5325 {
5326         int i;
5327         int j;
5328         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5329         int datasize = 2*numvertices*sizeof(float[4]);
5330         DPSOFTRAST_Command_Draw *command;
5331         unsigned char *data;
5332         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5333         {
5334                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5335                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5336                         break;
5337                 datasize += numvertices*sizeof(float[4]);
5338         }
5339         if (element3s)
5340                 datasize += numtriangles*sizeof(unsigned short[3]);
5341         else if (element3i)
5342                 datasize += numtriangles*sizeof(int[3]);
5343         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5344         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5345         {
5346                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5347                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5348         }
5349         else
5350         {
5351                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5352                 data = (unsigned char *)command + commandsize;
5353         }
5354         command->firstvertex = firstvertex;
5355         command->numvertices = numvertices;
5356         command->numtriangles = numtriangles;
5357         command->arrays = (float *)data;
5358         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5359         dpsoftrast.firstvertex = firstvertex;
5360         dpsoftrast.numvertices = numvertices;
5361         dpsoftrast.screencoord4f = (float *)data;
5362         data += numvertices*sizeof(float[4]);
5363         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5364         data += numvertices*sizeof(float[4]);
5365         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5366         {
5367                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5368                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5369                         break;
5370                 dpsoftrast.post_array4f[j] = (float *)data;
5371                 data += numvertices*sizeof(float[4]);
5372         }
5373         command->element3i = NULL;
5374         command->element3s = NULL;
5375         if (element3s)
5376         {
5377                 command->element3s = (unsigned short *)data;
5378                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5379         }
5380         else if (element3i)
5381         {
5382                 command->element3i = (int *)data;
5383                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5384         }
5385         return command;
5386 }
5387
5388 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5389 {
5390         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5391         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5392         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5393         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5394         if (command->starty >= command->endy)
5395         {
5396                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5397                         MM_FREE(command->arrays);
5398                 DPSOFTRAST_UndoCommand(command->commandsize);
5399                 return;
5400         }
5401         command->clipped = dpsoftrast.drawclipped;
5402         command->refcount = dpsoftrast.numthreads;
5403
5404         if (dpsoftrast.usethreads)
5405         {
5406                 int i;
5407                 DPSOFTRAST_Draw_SyncCommands();
5408                 for (i = 0; i < dpsoftrast.numthreads; i++)
5409                 {
5410                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5411                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5412                                 Thread_CondSignal(thread->drawcond);
5413                 }
5414         }
5415         else
5416         {
5417                 DPSOFTRAST_Draw_FlushThreads();
5418         }
5419 }
5420
5421 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5422 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5423 {
5424         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5425 }
5426 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5427 {
5428         DPSOFTRAST_Command_SetRenderTargets *command;
5429         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5430                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5431                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5432                 DPSOFTRAST_Flush();
5433         dpsoftrast.fb_width = width;
5434         dpsoftrast.fb_height = height;
5435         dpsoftrast.fb_depthpixels = depthpixels;
5436         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5437         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5438         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5439         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5440         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5441         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5442         command->width = width;
5443         command->height = height;
5444 }
5445  
5446 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5447 {
5448         int commandoffset = thread->commandoffset;
5449         while (commandoffset != endoffset)
5450         {
5451                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5452                 switch (command->opcode)
5453                 {
5454 #define INTERPCOMMAND(name) \
5455                 case DPSOFTRAST_OPCODE_##name : \
5456                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5457                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5458                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5459                                 commandoffset = 0; \
5460                         break;
5461                 INTERPCOMMAND(Viewport)
5462                 INTERPCOMMAND(ClearColor)
5463                 INTERPCOMMAND(ClearDepth)
5464                 INTERPCOMMAND(ColorMask)
5465                 INTERPCOMMAND(DepthTest)
5466                 INTERPCOMMAND(ScissorTest)
5467                 INTERPCOMMAND(Scissor)
5468                 INTERPCOMMAND(BlendFunc)
5469                 INTERPCOMMAND(BlendSubtract)
5470                 INTERPCOMMAND(DepthMask)
5471                 INTERPCOMMAND(DepthFunc)
5472                 INTERPCOMMAND(DepthRange)
5473                 INTERPCOMMAND(PolygonOffset)
5474                 INTERPCOMMAND(CullFace)
5475                 INTERPCOMMAND(AlphaTest)
5476                 INTERPCOMMAND(AlphaFunc)
5477                 INTERPCOMMAND(SetTexture)
5478                 INTERPCOMMAND(SetShader)
5479                 INTERPCOMMAND(Uniform4f)
5480                 INTERPCOMMAND(UniformMatrix4f)
5481                 INTERPCOMMAND(Uniform1i)
5482                 INTERPCOMMAND(SetRenderTargets)
5483                 INTERPCOMMAND(ClipPlane)
5484
5485                 case DPSOFTRAST_OPCODE_Draw:
5486                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5487                         commandoffset += command->commandsize;
5488                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5489                                 commandoffset = 0;
5490                         thread->commandoffset = commandoffset;
5491                         break;
5492
5493                 case DPSOFTRAST_OPCODE_Reset:
5494                         commandoffset = 0;
5495                         break;
5496                 }
5497         }
5498         thread->commandoffset = commandoffset;
5499 }
5500
5501 static int DPSOFTRAST_Draw_Thread(void *data)
5502 {
5503         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5504         while(thread->index >= 0)
5505         {
5506                 if (thread->commandoffset != dpsoftrast.drawcommand)
5507                 {
5508                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5509                 }
5510                 else 
5511                 {
5512                         Thread_LockMutex(thread->drawmutex);
5513                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5514                         {
5515                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5516                                 thread->starving = true;
5517                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5518                                 thread->starving = false;
5519                         }
5520                         Thread_UnlockMutex(thread->drawmutex);
5521                 }
5522         }   
5523         return 0;
5524 }
5525
5526 static void DPSOFTRAST_Draw_FlushThreads(void)
5527 {
5528         DPSOFTRAST_State_Thread *thread;
5529         int i;
5530         DPSOFTRAST_Draw_SyncCommands();
5531         if (dpsoftrast.usethreads) 
5532         {
5533                 for (i = 0; i < dpsoftrast.numthreads; i++)
5534                 {
5535                         thread = &dpsoftrast.threads[i];
5536                         if (thread->commandoffset != dpsoftrast.drawcommand)
5537                         {
5538                                 Thread_LockMutex(thread->drawmutex);
5539                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5540                                         Thread_CondSignal(thread->drawcond);
5541                                 Thread_UnlockMutex(thread->drawmutex);
5542                         }
5543                 }
5544                 for (i = 0; i < dpsoftrast.numthreads; i++)
5545                 {
5546                         thread = &dpsoftrast.threads[i];
5547                         if (thread->commandoffset != dpsoftrast.drawcommand)
5548                         {
5549                                 Thread_LockMutex(thread->drawmutex);
5550                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5551                                 {
5552                                         thread->waiting = true;
5553                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5554                                         thread->waiting = false;
5555                                 }
5556                                 Thread_UnlockMutex(thread->drawmutex);
5557                         }
5558                 }
5559         }
5560         else
5561         {
5562                 for (i = 0; i < dpsoftrast.numthreads; i++)
5563                 {
5564                         thread = &dpsoftrast.threads[i];
5565                         if (thread->commandoffset != dpsoftrast.drawcommand)
5566                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5567                 }
5568         }
5569         dpsoftrast.commandpool.usedcommands = 0;
5570 }
5571
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads().
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5576
// Public finish entry point: currently identical to DPSOFTRAST_Flush().
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5581
5582 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5583 {
5584         int i;
5585         union
5586         {
5587                 int i;
5588                 unsigned char b[4];
5589         }
5590         u;
5591         u.i = 1;
5592         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5593         dpsoftrast.bigendian = u.b[3];
5594         dpsoftrast.fb_width = width;
5595         dpsoftrast.fb_height = height;
5596         dpsoftrast.fb_depthpixels = depthpixels;
5597         dpsoftrast.fb_colorpixels[0] = colorpixels;
5598         dpsoftrast.fb_colorpixels[1] = NULL;
5599         dpsoftrast.fb_colorpixels[1] = NULL;
5600         dpsoftrast.fb_colorpixels[1] = NULL;
5601         dpsoftrast.viewport[0] = 0;
5602         dpsoftrast.viewport[1] = 0;
5603         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5604         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5605         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5606         dpsoftrast.texture_firstfree = 1;
5607         dpsoftrast.texture_end = 1;
5608         dpsoftrast.texture_max = 0;
5609         dpsoftrast.color[0] = 1;
5610         dpsoftrast.color[1] = 1;
5611         dpsoftrast.color[2] = 1;
5612         dpsoftrast.color[3] = 1;
5613         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5614         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5615         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5616         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5617         for (i = 0; i < dpsoftrast.numthreads; i++)
5618         {
5619                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5620                 thread->index = i;
5621                 thread->cullface = GL_BACK;
5622         thread->colormask[0] = 1; 
5623                 thread->colormask[1] = 1;
5624                 thread->colormask[2] = 1;
5625                 thread->colormask[3] = 1;
5626                 thread->blendfunc[0] = GL_ONE;
5627                 thread->blendfunc[1] = GL_ZERO;
5628                 thread->depthmask = true;
5629                 thread->depthtest = true;
5630                 thread->depthfunc = GL_LEQUAL;
5631                 thread->scissortest = false;
5632                 thread->alphatest = false;
5633                 thread->alphafunc = GL_GREATER;
5634                 thread->alphavalue = 0.5f;
5635                 thread->viewport[0] = 0;
5636                 thread->viewport[1] = 0;
5637                 thread->viewport[2] = dpsoftrast.fb_width;
5638                 thread->viewport[3] = dpsoftrast.fb_height;
5639                 thread->scissor[0] = 0;
5640                 thread->scissor[1] = 0;
5641                 thread->scissor[2] = dpsoftrast.fb_width;
5642                 thread->scissor[3] = dpsoftrast.fb_height;
5643                 thread->depthrange[0] = 0;
5644                 thread->depthrange[1] = 1;
5645                 thread->polygonoffset[0] = 0;
5646                 thread->polygonoffset[1] = 0;
5647                 thread->clipplane[0] = 0;
5648                 thread->clipplane[1] = 0;
5649                 thread->clipplane[2] = 0;
5650                 thread->clipplane[3] = 1;
5651         
5652                 thread->numspans = 0;
5653                 thread->numtriangles = 0;
5654                 thread->commandoffset = 0;
5655                 thread->waiting = false;
5656                 thread->starving = false;
5657            
5658                 thread->validate = -1;
5659                 DPSOFTRAST_Validate(thread, -1);
5660  
5661                 if (dpsoftrast.usethreads)
5662                 {
5663                         thread->waitcond = Thread_CreateCond();
5664                         thread->drawcond = Thread_CreateCond();
5665                         thread->drawmutex = Thread_CreateMutex();
5666                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5667                 }
5668         }
5669         return 0;
5670 }
5671
5672 void DPSOFTRAST_Shutdown(void)
5673 {
5674         int i;
5675         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5676         {
5677                 DPSOFTRAST_State_Thread *thread;
5678                 for (i = 0; i < dpsoftrast.numthreads; i++)
5679                 {
5680                         thread = &dpsoftrast.threads[i];
5681                         Thread_LockMutex(thread->drawmutex);
5682                         thread->index = -1;
5683                         Thread_CondSignal(thread->drawcond);
5684                         Thread_UnlockMutex(thread->drawmutex);
5685                         Thread_WaitThread(thread->thread, 0);
5686                         Thread_DestroyCond(thread->waitcond);
5687                         Thread_DestroyCond(thread->drawcond);
5688                         Thread_DestroyMutex(thread->drawmutex);
5689                 }
5690         }
5691         for (i = 0;i < dpsoftrast.texture_end;i++)
5692                 if (dpsoftrast.texture[i].bytes)
5693                         MM_FREE(dpsoftrast.texture[i].bytes);
5694         if (dpsoftrast.texture)
5695                 free(dpsoftrast.texture);
5696         if (dpsoftrast.threads)
5697                 MM_FREE(dpsoftrast.threads);
5698         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5699 }
5700