]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
fix some clipping plane bugs and FinishBGRA8 bug introduced in r11015
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
// MSVC only: disable warning 4324 for this file
// NOTE(review): presumably the "structure was padded due to alignment specifier"
// warning caused by the ALIGN/ATOMIC wrappers below - confirm
#ifdef _MSC_VER
#pragma warning(disable : 4324)
#endif

// plain C builds get a bool type by aliasing qboolean
#ifndef __cplusplus
typedef qboolean bool;
#endif

// byte alignment constants (COMMAND_SIZE is ALIGN_SIZE, MM_MALLOC aligns to ATOMIC_SIZE)
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32

// When SSE is available, pick compiler specific spellings for:
//   ALIGN(var)                - declare var with 16 byte alignment
//   ATOMIC(var)               - declare var with 32 byte alignment
//   MEMORY_BARRIER            - _mm_sfence()
//   ATOMIC_COUNTER            - integer type used for shared counters
//   ATOMIC_INCREMENT/DECREMENT- atomic ++/-- on a counter
//   ATOMIC_ADD(counter, val)  - atomic add, result discarded
#ifdef SSE_POSSIBLE
        #if defined(__APPLE__)
                // Apple: OSAtomic barrier variants from libkern
                #include <libkern/OSAtomic.h>
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
                #define MEMORY_BARRIER (_mm_sfence())
                #define ATOMIC_COUNTER volatile int32_t 
                #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
                #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
        #elif defined(__GNUC__)
                // GCC compatible compilers: __sync builtins
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile int
                #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
                #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
                #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
        #elif defined(_MSC_VER)
                // MSVC: Interlocked* functions
                #define ALIGN(var) __declspec(align(16)) var
                #define ATOMIC(var) __declspec(align(32)) var
                #define MEMORY_BARRIER (_mm_sfence())
                //(MemoryBarrier())
                #define ATOMIC_COUNTER volatile LONG
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
        #endif
#endif

// Fallbacks for anything not defined above (no SSE, or an unrecognized compiler):
// no alignment request, no barrier, and plain non-atomic integer arithmetic
#ifndef ALIGN
#define ALIGN(var) var
#endif
#ifndef ATOMIC
#define ATOMIC(var) var
#endif
#ifndef MEMORY_BARRIER
#define MEMORY_BARRIER ((void)0)
#endif
#ifndef ATOMIC_COUNTER
#define ATOMIC_COUNTER int
#endif
#ifndef ATOMIC_INCREMENT
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#ifndef ATOMIC_DECREMENT
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#ifndef ATOMIC_ADD
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
72
// Memory helpers: MM_MALLOC / MM_CALLOC / MM_FREE.
// With SSE the allocations are aligned to ATOMIC_SIZE via _mm_malloc and must
// be released with MM_FREE (_mm_free); without SSE they map to the C library.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// calloc-style allocation of nmemb*size zeroed bytes aligned to ATOMIC_SIZE.
// Returns NULL if the allocation fails or if nmemb*size does not fit in a
// size_t (calloc rejects that case too, a bare multiplication would wrap and
// hand back a buffer that is too small).
static void *MM_CALLOC(size_t nmemb, size_t size)
{
        size_t total;
        void *ptr;
        if (size != 0 && nmemb > (size_t)-1 / size)
                return NULL;
        total = nmemb * size;
        ptr = _mm_malloc(total, ATOMIC_SIZE);
        if (ptr != NULL)
                memset(ptr, 0, total);
        return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
91
// Indices of the per-vertex attribute arrays (position, color and eight
// texcoord sets). DPSOFTRAST_ARRAY_TOTAL is the count and is used to size
// per-attribute tables such as DPSOFTRAST_State_Triangle::attribs and
// DPSOFTRAST_State::post_array4f.
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION,
        DPSOFTRAST_ARRAY_COLOR,
        DPSOFTRAST_ARRAY_TEXCOORD0,
        DPSOFTRAST_ARRAY_TEXCOORD1,
        DPSOFTRAST_ARRAY_TEXCOORD2,
        DPSOFTRAST_ARRAY_TEXCOORD3,
        DPSOFTRAST_ARRAY_TEXCOORD4,
        DPSOFTRAST_ARRAY_TEXCOORD5,
        DPSOFTRAST_ARRAY_TEXCOORD6,
        DPSOFTRAST_ARRAY_TEXCOORD7,
        DPSOFTRAST_ARRAY_TOTAL
}
DPSOFTRAST_ARRAY;
107
// One slot of the texture table (dpsoftrast.texture).
// A slot whose bytes pointer is NULL is treated as unused.
typedef struct DPSOFTRAST_Texture_s
{
        int flags; // flags as passed to DPSOFTRAST_Texture_New (format and DPSOFTRAST_TEXTURE_FLAG_* bits)
        int width; // level 0 width in texels
        int height; // level 0 height in texels
        int depth; // level 0 depth in texels (1 for non-3D textures)
        int sides; // 6 for cubemaps, 1 otherwise
        DPSOFTRAST_TEXTURE_FILTER filter;
        int mipmaps; // number of entries used in mipmap[]
        int size; // total size in bytes of all levels stored in bytes
        ATOMIC_COUNTER binds; // when nonzero, DPSOFTRAST_Flush is called before the texture is modified or freed
        unsigned char *bytes; // storage for all levels, 4 bytes per texel
        // per level: [0] byte offset into bytes, [1] size in bytes, [2] width, [3] height, [4] depth
        int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
}
DPSOFTRAST_Texture;
123
// commands are declared with ALIGN_SIZE (16 byte) alignment
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Common header shared by every command struct: the opcode identifying the
// command type and the size of the command (stored as an unsigned short).
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode;
        unsigned short commandsize;
}
DPSOFTRAST_Command);

// opcode 0 is reserved for Reset and has no dedicated struct
enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares both
//   - the constant DPSOFTRAST_OPCODE_<name> with value opcodeval, and
//   - the aligned struct DPSOFTRAST_Command_<name>, which starts with the same
//     opcode/commandsize header as DPSOFTRAST_Command followed by fields.
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );
144
// size in bytes of the command pool buffer (2 MiB)
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the largest size a single command may have (16 KiB) - confirm at the allocation site
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// Fixed-size byte buffer that holds queued commands.
// NOTE(review): freecommand/usedcommands look like a write offset and a fill
// counter for a ring buffer - their exact meaning is defined by code outside this view
typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
{
        int freecommand;
        int usedcommands;
        ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
155
// Per-triangle setup data consumed by the span shaders.
// attribs[array][k] holds one 4-float vector per attribute array; the
// DPSOFTRAST_CALCATTRIB macros evaluate an attribute at a span as
// attribs[..][2] + span->x * attribs[..][0] + span->y * attribs[..][1].
typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
{
        unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        float w[3];
        ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
163
// SSE version: loads the per-x slope of attribute arrayindex into slope and
// computes the attribute value at (span->x, span->y) into data:
//   data = attribs[arrayindex][2] + span->x * attribs[arrayindex][0] + span->y * attribs[arrayindex][1]
// slope and data must be __m128 lvalues; attribs is read with aligned loads.
// Expands to a bare { } block, so it is not safe as the body of an unbraced if/else.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of the same computation; slope and data are float[4] (or float *).
// Note: span is used unparenthesized here, so pass a plain pointer expression.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        slope[0] = (triangle)->attribs[arrayindex][0][0]; \
        slope[1] = (triangle)->attribs[arrayindex][0][1]; \
        slope[2] = (triangle)->attribs[arrayindex][0][2]; \
        slope[3] = (triangle)->attribs[arrayindex][0][3]; \
        data[0] = (triangle)->attribs[arrayindex][2][0] + (span->x)*slope[0] + (span->y)*(triangle)->attribs[arrayindex][1][0]; \
        data[1] = (triangle)->attribs[arrayindex][2][1] + (span->x)*slope[1] + (span->y)*(triangle)->attribs[arrayindex][1][1]; \
        data[2] = (triangle)->attribs[arrayindex][2][2] + (span->x)*slope[2] + (span->y)*(triangle)->attribs[arrayindex][1][2]; \
        data[3] = (triangle)->attribs[arrayindex][2][3] + (span->x)*slope[3] + (span->y)*(triangle)->attribs[arrayindex][1][3]; \
}
180                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels produced by rasterizing a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle; // triangle this span was generated by
        int x; // framebuffer x coord
        int y; // framebuffer y coord
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
        int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
        int depthslope; // depthbuffer value pixel delta
}
DPSOFTRAST_State_Span);

// capacities of the per-thread spans[] and triangles[] arrays and of the
// per-thread pixelmaskarray (which is padded by 4 extra bytes)
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// Bits for DPSOFTRAST_State_Thread::validate; a set bit means the matching
// derived state must be recomputed by DPSOFTRAST_Validate before use.
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
204
// Blend modes the rasterizer distinguishes. DPSOFTRAST_RecalcBlendFunc picks
// one from the (sfactor, dfactor) pair; the pairs noted below are the ones it
// recognizes, anything else falls back to OPAQUE.
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE, // GL_ONE, GL_ZERO
        DPSOFTRAST_BLENDMODE_ALPHA, // GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA
        DPSOFTRAST_BLENDMODE_ADDALPHA, // GL_SRC_ALPHA, GL_ONE
        DPSOFTRAST_BLENDMODE_ADD, // GL_ONE, GL_ONE
        DPSOFTRAST_BLENDMODE_INVMOD, // GL_ZERO, GL_ONE_MINUS_SRC_COLOR
        DPSOFTRAST_BLENDMODE_MUL, // GL_ZERO, GL_SRC_COLOR or GL_DST_COLOR, GL_ZERO
        DPSOFTRAST_BLENDMODE_MUL2, // GL_DST_COLOR, GL_SRC_COLOR
        DPSOFTRAST_BLENDMODE_SUBALPHA, // GL_SRC_ALPHA, GL_ONE with blendsubtract enabled
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA, // GL_ONE, GL_ONE_MINUS_SRC_ALPHA
        DPSOFTRAST_BLENDMODE_INVADD, // GL_ONE_MINUS_DST_COLOR, GL_ONE
        DPSOFTRAST_BLENDMODE_TOTAL
}
DPSOFTRAST_BLENDMODE;
220
// Complete state of one rasterizer thread: its copy of the GL-like render
// state, the values derived from it, its screen bands, synchronization
// handles and the scratch arrays used while drawing.
typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
{
        void *thread;
        int index; // position of this thread in dpsoftrast.threads, used to pick its bands
        
        // render state as set by the API
        int cullface;
        int colormask[4];
        int blendfunc[2]; // [0] source factor, [1] destination factor
        int blendsubtract;
        int depthmask;
        int depthtest;
        int depthfunc;
        int scissortest;
        int alphatest;
        int alphafunc;
        float alphavalue;
        int viewport[4]; // x, y, width, height
        int scissor[4]; // x, y, width, height
        float depthrange[2];
        float polygonoffset[2];
        float clipplane[4];
        ALIGN(float fb_clipplane[4]); // clipplane transformed by DPSOFTRAST_RecalcClipPlane

        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // pointers into dpsoftrast.texture (rebased by DPSOFTRAST_Texture_Grow)
        
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // DPSOFTRAST_VALIDATE_ flags
        int validate;

        // derived values (DPSOFTRAST_VALIDATE_FB)
        int fb_colormask;
        int fb_scissor[4]; // x, y, width, height in framebuffer coordinates, clamped to the framebuffer
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
        int fb_depthfunc; // depthfunc, or GL_ALWAYS when depthtest is off

        // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
        int fb_blendmode; // a DPSOFTRAST_BLENDMODE value

        // band boundaries
        // (two row ranges when dpsoftrast.interlace is set, otherwise both ranges are identical)
        int miny1;
        int maxy1;
        int miny2;
        int maxy2;

        ATOMIC(volatile int commandoffset);

        volatile bool waiting;
        volatile bool starving;
        void *waitcond;
        void *drawcond;
        void *drawmutex;

        // scratch storage filled while drawing
        int numspans;
        int numtriangles;
        DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
        DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
        unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
}
DPSOFTRAST_State_Thread);
289
// Global rasterizer state shared by the API entry points: framebuffer
// description, vertex array pointers, the texture table, the worker threads
// and the command pool.
typedef ATOMIC(struct DPSOFTRAST_State_s
{
        // framebuffer
        int fb_width;
        int fb_height;
        unsigned int *fb_depthpixels;
        unsigned int *fb_colorpixels[4];

        int viewport[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        float color[4];
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // vertex array sources
        const float *pointer_vertex3f;
        const float *pointer_color4f;
        const unsigned char *pointer_color4ub;
        const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int stride_vertex;
        int stride_color;
        int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // pointers into texture (rebased by DPSOFTRAST_Texture_Grow)

        int firstvertex;
        int numvertices;
        float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
        float *screencoord4f;
        int drawstarty;
        int drawendy;
        int drawclipped;
        
        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        // texture table
        int texture_max; // allocated number of slots in texture
        int texture_end; // one past the highest slot considered by lookups
        int texture_firstfree; // where DPSOFTRAST_Texture_New starts searching for an unused slot
        DPSOFTRAST_Texture *texture;

        int bigendian;

        // error reporting
        const char *errorstring;

        bool usethreads;
        int interlace; // nonzero gives each thread two row bands instead of one
        int numthreads;
        DPSOFTRAST_State_Thread *threads;

        ATOMIC(volatile int drawcommand);

        DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

// the single global rasterizer instance
DPSOFTRAST_State dpsoftrast;
349
// scale applied when converting a floating point depth to the integer depth buffer format (2^30)
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (expected in 0..1) into one 32bit pixel with
// blue in bits 0-7, green in bits 8-15, red in bits 16-23 and alpha in bits
// 24-31, rounding each channel to the nearest 8bit value.
// Every argument is parenthesized so expressions such as (a + b) convert correctly.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth buffer value DPSOFTRAST_DEPTHSCALE * (1 - d).
// The argument is parenthesized so that passing a sum or difference is not mis-grouped.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
354
355 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
356 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
357
358 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
359 {
360         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
361         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
362         fb_viewportcenter[3] = 0.5f;
363         fb_viewportcenter[0] = 0.0f;
364         fb_viewportscale[1] = 0.5f * viewport[2];
365         fb_viewportscale[2] = -0.5f * viewport[3];
366         fb_viewportscale[3] = 0.5f;
367         fb_viewportscale[0] = 1.0f;
368 }
369
370 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
371 {
372         if (dpsoftrast.interlace)
373         {
374                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
375                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
376                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
377                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
378         }
379         else
380         {
381                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
382                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
383         }
384 }
385
386 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
387 {
388         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
389         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
390         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
391         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
392         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
393 }
394
395 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
396 {
397         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
398         // and viewport projection values
399         int x1, x2;
400         int y1, y2;
401         x1 = thread->scissor[0];
402         x2 = thread->scissor[0] + thread->scissor[2];
403         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
404         y2 = dpsoftrast.fb_height - thread->scissor[1];
405         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
406         if (x1 < 0) x1 = 0;
407         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
408         if (y1 < 0) y1 = 0;
409         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
410         thread->fb_scissor[0] = x1;
411         thread->fb_scissor[1] = y1;
412         thread->fb_scissor[2] = x2 - x1;
413         thread->fb_scissor[3] = y2 - y1;
414
415         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
416         DPSOFTRAST_RecalcClipPlane(thread);
417         DPSOFTRAST_RecalcThread(thread);
418 }
419
420 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
421 {
422         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
423 }
424
425 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
426 {
427         if (thread->blendsubtract)
428         {
429                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
430                 {
431                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
432                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
433                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
434                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
435                 }
436         }
437         else
438         {       
439                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
440                 {
441                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
442                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
443                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
444                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
445                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
446                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
447                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
448                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
449                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
450                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
451                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
452                 }
453         }
454 }
455
456 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
457
458 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
459 {
460         mask &= thread->validate;
461         if (!mask)
462                 return;
463         if (mask & DPSOFTRAST_VALIDATE_FB)
464         {
465                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
466                 DPSOFTRAST_RecalcFB(thread);
467         }
468         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
469         {
470                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
471                 DPSOFTRAST_RecalcDepthFunc(thread);
472         }
473         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
474         {
475                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
476                 DPSOFTRAST_RecalcBlendFunc(thread);
477         }
478 }
479
480 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
481 {
482         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
483                 return &dpsoftrast.texture[index];
484         return NULL;
485 }
486
487 static void DPSOFTRAST_Texture_Grow(void)
488 {
489         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
490         DPSOFTRAST_State_Thread *thread;
491         int i;
492         int j;
493         DPSOFTRAST_Flush();
494         // expand texture array as needed
495         if (dpsoftrast.texture_max < 1024)
496                 dpsoftrast.texture_max = 1024;
497         else
498                 dpsoftrast.texture_max *= 2;
499         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
500         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
501                 if (dpsoftrast.texbound[i])
502                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
503         for (j = 0; j < dpsoftrast.numthreads; j++)
504         {
505                 thread = &dpsoftrast.threads[j];
506                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
507                         if (thread->texbound[i])
508                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
509         }
510 }
511
512 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
513 {
514         int w;
515         int h;
516         int d;
517         int size;
518         int s;
519         int texnum;
520         int mipmaps;
521         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
522         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
523         DPSOFTRAST_Texture *texture;
524         if (width*height*depth < 1)
525         {
526                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
527                 return 0;
528         }
529         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
530         {
531                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
532                 return 0;
533         }
534         switch(texformat)
535         {
536         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
537         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
538         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
539                 break;
540         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
541                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
542                 {
543                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
544                         return 0;
545                 }
546                 if (depth != 1)
547                 {
548                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
549                         return 0;
550                 }
551                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
552                 {
553                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
554                         return 0;
555                 }
556                 break;
557         }
558         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
559         {
560                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
561                 return 0;
562         }
563         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
564         {
565                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
566                 return 0;
567         }
568         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
569         {
570                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
571                 return 0;
572         }
573         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
574         {
575                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
576                 return 0;
577         }
578         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
579         {
580                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
581                 return 0;
582         }
583         // find first empty slot in texture array
584         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
585                 if (!dpsoftrast.texture[texnum].bytes)
586                         break;
587         dpsoftrast.texture_firstfree = texnum + 1;
588         if (dpsoftrast.texture_max <= texnum)
589                 DPSOFTRAST_Texture_Grow();
590         if (dpsoftrast.texture_end <= texnum)
591                 dpsoftrast.texture_end = texnum + 1;
592         texture = &dpsoftrast.texture[texnum];
593         memset(texture, 0, sizeof(*texture));
594         texture->flags = flags;
595         texture->width = width;
596         texture->height = height;
597         texture->depth = depth;
598         texture->sides = sides;
599         texture->binds = 0;
600         w = width;
601         h = height;
602         d = depth;
603         size = 0;
604         mipmaps = 0;
605         w = width;
606         h = height;
607         d = depth;
608         for (;;)
609         {
610                 s = w * h * d * sides * 4;
611                 texture->mipmap[mipmaps][0] = size;
612                 texture->mipmap[mipmaps][1] = s;
613                 texture->mipmap[mipmaps][2] = w;
614                 texture->mipmap[mipmaps][3] = h;
615                 texture->mipmap[mipmaps][4] = d;
616                 size += s;
617                 mipmaps++;
618                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
619                         break;
620                 if (w > 1) w >>= 1;
621                 if (h > 1) h >>= 1;
622                 if (d > 1) d >>= 1;
623         }
624         texture->mipmaps = mipmaps;
625         texture->size = size;
626
627         // allocate the pixels now
628         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
629
630         return texnum;
631 }
632 void DPSOFTRAST_Texture_Free(int index)
633 {
634         DPSOFTRAST_Texture *texture;
635         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
636         if (texture->binds)
637                 DPSOFTRAST_Flush();
638         if (texture->bytes)
639                 MM_FREE(texture->bytes);
640         texture->bytes = NULL;
641         memset(texture, 0, sizeof(*texture));
642         // adjust the free range and used range
643         if (dpsoftrast.texture_firstfree > index)
644                 dpsoftrast.texture_firstfree = index;
645         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
646                 dpsoftrast.texture_end--;
647 }
// Regenerates every mip level above level 0 of the given texture.
// Level i is built from level i-1 by averaging blocks of source texels with
// rounding; texels are 4 bytes and each byte channel is averaged separately.
// Does nothing for invalid handles or textures with a single level.
// This function does not flush pending work itself.
void DPSOFTRAST_Texture_CalculateMipmaps(int index)
{
        int i, x, y, z, w, layer0, layer1, row0, row1;
        unsigned char *o, *i0, *i1, *i2, *i3;
        DPSOFTRAST_Texture *texture;
        texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
        if (texture->mipmaps <= 1)
                return;
        for (i = 1;i < texture->mipmaps;i++)
        {
                // mipmap[i][4] = depth, [3] = height, [2] = width, [0] = byte offset of level i
                for (z = 0;z < texture->mipmap[i][4];z++)
                {
                        // the two source layers feeding destination layer z,
                        // clamped when the source level has only one layer
                        layer0 = z*2;
                        layer1 = z*2+1;
                        if (layer1 >= texture->mipmap[i-1][4])
                                layer1 = texture->mipmap[i-1][4]-1;
                        for (y = 0;y < texture->mipmap[i][3];y++)
                        {
                                // the two source rows feeding destination row y,
                                // clamped when the source level has only one row
                                row0 = y*2;
                                row1 = y*2+1;
                                if (row1 >= texture->mipmap[i-1][3])
                                        row1 = texture->mipmap[i-1][3]-1;
                                // o = start of the destination row; i0..i3 = starts of the
                                // source rows (layer0,row0) (layer0,row1) (layer1,row0) (layer1,row1)
                                o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
                                i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
                                i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
                                i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
                                i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
                                w = texture->mipmap[i][2];
                                if (layer1 > layer0)
                                {
                                        // two distinct source layers
                                        if (texture->mipmap[i-1][2] > 1)
                                        {
                                                // average 3D texture
                                                // (8 source texels per destination texel)
                                                for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
                                                {
                                                        o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
                                                        o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
                                                        o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
                                                        o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
                                                }
                                        }
                                        else
                                        {
                                                // average 3D mipmap with parent width == 1
                                                // NOTE(review): i2/i3 are not advanced by this loop; that only
                                                // matters if w can exceed 1 here - confirm it can not
                                                for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
                                                {
                                                        o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
                                                        o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
                                                        o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
                                                        o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
                                                }
                                        }
                                }
                                else
                                {
                                        // single source layer
                                        if (texture->mipmap[i-1][2] > 1)
                                        {
                                                // average 2D texture (common case)
                                                // (2x2 source texels per destination texel)
                                                for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
                                                {
                                                        o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
                                                        o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
                                                        o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
                                                        o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
                                                }
                                        }
                                        else
                                        {
                                                // 2D texture with parent width == 1
                                                // (only one destination texel per row, average the two source rows)
                                                o[0] = (i0[0] + i1[0] + 1) >> 1;
                                                o[1] = (i0[1] + i1[1] + 1) >> 1;
                                                o[2] = (i0[2] + i1[2] + 1) >> 1;
                                                o[3] = (i0[3] + i1[3] + 1) >> 1;
                                        }
                                }
                        }
                }
        }
}
727 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
728 {
729         DPSOFTRAST_Texture *texture;
730         unsigned char *dst;
731         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
732         if (texture->binds)
733                 DPSOFTRAST_Flush();
734         if (pixels)
735         {
736                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
737                 while (blockheight > 0)
738                 {
739                         memcpy(dst, pixels, blockwidth * 4);
740                         pixels += blockwidth * 4;
741                         dst += texture->mipmap[0][2] * 4;
742                         blockheight--;
743                 }
744         }
745         DPSOFTRAST_Texture_CalculateMipmaps(index);
746 }
747 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
748 {
749         DPSOFTRAST_Texture *texture;
750         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
751         if (texture->binds)
752                 DPSOFTRAST_Flush();
753         if (pixels)
754                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
755         DPSOFTRAST_Texture_CalculateMipmaps(index);
756 }
757 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
758 {
759         DPSOFTRAST_Texture *texture;
760         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
761         return texture->mipmap[mip][2];
762 }
763 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
764 {
765         DPSOFTRAST_Texture *texture;
766         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
767         return texture->mipmap[mip][3];
768 }
769 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
770 {
771         DPSOFTRAST_Texture *texture;
772         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
773         return texture->mipmap[mip][4];
774 }
775 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
776 {
777         DPSOFTRAST_Texture *texture;
778         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
779         if (texture->binds)
780                 DPSOFTRAST_Flush();
781         return texture->bytes + texture->mipmap[mip][0];
782 }
783 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
784 {
785         DPSOFTRAST_Texture *texture;
786         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
787         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
788         {
789                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
790                 return;
791         }
792         if (texture->binds)
793                 DPSOFTRAST_Flush();
794         texture->filter = filter;
795 }
796
// forward declaration: DPSOFTRAST_AllocateCommand below calls this when threads are not in use
static void DPSOFTRAST_Draw_FlushThreads(void);
798
// Advances dpsoftrast.drawcommand to the current commandpool.freecommand.
// When threads are in use a MEMORY_BARRIER is issued first
// (presumably so the command contents are visible to other threads before the
// new offset is - confirm against the thread loop).
static void DPSOFTRAST_Draw_SyncCommands(void)
{
        if(dpsoftrast.usethreads) MEMORY_BARRIER;
        dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
}
804
// Waits until the command pool has room for "space" more units (same units as
// command sizes), or until no thread is behind anymore.
// Returns immediately if the cached usedcommands count already leaves room.
// On exit commandpool.usedcommands holds the recomputed usage.
static void DPSOFTRAST_Draw_FreeCommandPool(int space)
{
        DPSOFTRAST_State_Thread *thread;
        int i;
        int freecommand = dpsoftrast.commandpool.freecommand;
        int usedcommands = dpsoftrast.commandpool.usedcommands;
        if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
                return;
        // publish everything queued so far before waiting on the threads
        DPSOFTRAST_Draw_SyncCommands();
        for(;;)
        {
                int waitindex = -1;
                int commandoffset;
                usedcommands = 0;
                // find the thread that is furthest behind freecommand; its lag is the amount of pool still in use
                for (i = 0; i < dpsoftrast.numthreads; i++)
                {
                        thread = &dpsoftrast.threads[i]; 
                        commandoffset = freecommand - thread->commandoffset;
                        // the pool is a ring, so a negative distance means the thread offset has not wrapped yet
                        if (commandoffset < 0)
                                commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
                        if (commandoffset > usedcommands)
                        {
                                waitindex = i;
                                usedcommands = commandoffset;
                        }
                }
                // done when there is enough room, or when every thread has caught up (waitindex stays -1)
                if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
                        break;
                thread = &dpsoftrast.threads[waitindex];
                Thread_LockMutex(thread->drawmutex);
                // only wait if the thread has not yet reached the published drawcommand offset
                if (thread->commandoffset != dpsoftrast.drawcommand)
                {
                        thread->waiting = true;
                        // wake the thread through drawcond if it has flagged itself as starving
                        if (thread->starving) Thread_CondSignal(thread->drawcond);
                        Thread_CondWait(thread->waitcond, thread->drawmutex);
                        thread->waiting = false;
                }
                Thread_UnlockMutex(thread->drawmutex);
        }
        dpsoftrast.commandpool.usedcommands = usedcommands;
}
846
// DPSOFTRAST_ALIGNCOMMAND rounds size up to the next multiple of COMMAND_SIZE
// (the mask arithmetic assumes COMMAND_SIZE is a power of two - confirm at its definition).
#define DPSOFTRAST_ALIGNCOMMAND(size) \
        ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// DPSOFTRAST_ALLOCATECOMMAND(name) reserves an aligned DPSOFTRAST_Command_<name> in the
// command pool with opcode DPSOFTRAST_OPCODE_<name> and returns it as a typed pointer.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
        ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
851
// Reserves "size" units in the command ring buffer, writes the command header
// (opcode and commandsize) and returns a pointer to the command.
// If the pool is too full it first waits for the threads (threaded mode) or
// flushes (non-threaded mode). If the command does not fit before the end of
// the pool, a DPSOFTRAST_OPCODE_Reset marker is written at the current position
// and the command is placed at offset 0.
static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
{
        DPSOFTRAST_Command *command;
        int freecommand = dpsoftrast.commandpool.freecommand;
        int usedcommands = dpsoftrast.commandpool.usedcommands;
        // always keep room for one extra command header (presumably for the Reset marker - confirm)
        int extra = sizeof(DPSOFTRAST_Command);
        // if we will have to wrap, the unused tail of the pool counts against the free space too
        if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
                extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
        if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
        {
                if (dpsoftrast.usethreads)
                        DPSOFTRAST_Draw_FreeCommandPool(size + extra);
                else
                        DPSOFTRAST_Draw_FlushThreads();
                // both calls may have changed the pool state, so reload the cached values
                freecommand = dpsoftrast.commandpool.freecommand;
                usedcommands = dpsoftrast.commandpool.usedcommands;
        }
        if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
        {
                // not enough room before the end: mark the tail with a Reset command and wrap to the start
                command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
                command->opcode = DPSOFTRAST_OPCODE_Reset;
                usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
                freecommand = 0;
        }
        command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
        command->opcode = opcode;
        command->commandsize = size;
        freecommand += size;
        if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
                freecommand = 0;
        dpsoftrast.commandpool.freecommand = freecommand;
        dpsoftrast.commandpool.usedcommands = usedcommands + size;
        return command;
}
886
887 static void DPSOFTRAST_UndoCommand(int size)
888 {
889         int freecommand = dpsoftrast.commandpool.freecommand;
890         int usedcommands = dpsoftrast.commandpool.usedcommands;
891         freecommand -= size;
892         if (freecommand < 0)
893                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
894         usedcommands -= size;
895         dpsoftrast.commandpool.freecommand = freecommand;
896         dpsoftrast.commandpool.usedcommands = usedcommands;
897 }
898                 
// Viewport command: carries x, y, width, height.
// (DEFCOMMAND is defined outside this chunk; it evidently declares
// DPSOFTRAST_Command_Viewport with the listed fields.)
DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
// Applies a queued Viewport command to one thread's state and marks the
// framebuffer state for revalidation.
static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
{
        thread->viewport[0] = command->x;
        thread->viewport[1] = command->y;
        thread->viewport[2] = command->width;
        thread->viewport[3] = command->height;
        thread->validate |= DPSOFTRAST_VALIDATE_FB;
}
908 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
909 {
910         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
911         command->x = x;
912         command->y = y;
913         command->width = width;
914         command->height = height;
915
916         dpsoftrast.viewport[0] = x;
917         dpsoftrast.viewport[1] = y;
918         dpsoftrast.viewport[2] = width;
919         dpsoftrast.viewport[3] = height;
920         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
921 }
922
// ClearColor command: carries the clear color as four floats.
DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;)
// Fills this thread's share of the scissor rectangle with the clear color in
// every allocated color buffer.
// The thread owns two row ranges, [miny1,maxy1) and [miny2,maxy2); only rows
// inside those ranges (and inside the scissor) are written.
static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
{
        int i, x1, y1, x2, y2, w, h, x, y;
        int miny1, maxy1, miny2, maxy2;
        int bandy;
        unsigned int *p;
        unsigned int c;
        // make sure fb_scissor and the row ranges are up to date
        DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
        miny1 = thread->miny1;
        maxy1 = thread->maxy1;
        miny2 = thread->miny2;
        maxy2 = thread->maxy2;
        x1 = thread->fb_scissor[0];
        y1 = thread->fb_scissor[1];
        x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
        y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
        // clamp the vertical extent to the rows this thread owns
        if (y1 < miny1) y1 = miny1;
        if (y2 > maxy2) y2 = maxy2;
        w = x2 - x1;
        h = y2 - y1;
        if (w < 1 || h < 1)
                return;
        // FIXME: honor fb_colormask?
        c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
        for (i = 0;i < 4;i++)
        {
                if (!dpsoftrast.fb_colorpixels[i])
                        continue;
                // outer loop: first pass covers rows up to maxy1, then y jumps to miny2 and the second pass runs up to maxy2
                for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
                for (;y < bandy;y++)
                {
                        p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
                        for (x = x1;x < x2;x++)
                                p[x] = c;
                }
        }
}
961 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
962 {
963         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
964         command->r = r;
965         command->g = g;
966         command->b = b;
967         command->a = a;
968 }
969
// ClearDepth command: carries the depth value to clear to.
DEFCOMMAND(3, ClearDepth, float depth;)
// Fills this thread's share of the scissor rectangle of the depth buffer with
// the converted clear depth.
// The thread owns two row ranges, [miny1,maxy1) and [miny2,maxy2); only rows
// inside those ranges (and inside the scissor) are written.
static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
{
        int x1, y1, x2, y2, w, h, x, y;
        int miny1, maxy1, miny2, maxy2;
        int bandy;
        unsigned int *p;
        unsigned int c;
        // make sure fb_scissor and the row ranges are up to date
        DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
        miny1 = thread->miny1;
        maxy1 = thread->maxy1;
        miny2 = thread->miny2;
        maxy2 = thread->maxy2;
        x1 = thread->fb_scissor[0];
        y1 = thread->fb_scissor[1];
        x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
        y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
        // clamp the vertical extent to the rows this thread owns
        if (y1 < miny1) y1 = miny1;
        if (y2 > maxy2) y2 = maxy2;
        w = x2 - x1;
        h = y2 - y1;
        if (w < 1 || h < 1)
                return;
        c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
        // outer loop: first pass covers rows up to maxy1, then y jumps to miny2 and the second pass runs up to maxy2
        for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
        for (;y < bandy;y++)
        {
                p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
                for (x = x1;x < x2;x++)
                        p[x] = c;
        }
}
1002 void DPSOFTRAST_ClearDepth(float d)
1003 {
1004         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1005         command->depth = d;
1006 }
1007
// ColorMask command: one enable flag per channel.
DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
// Stores the per-channel write enables (normalized to 0/1) and builds the
// matching 32-bit pixel mask in thread->fb_colormask.
static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
{
        thread->colormask[0] = command->r != 0;
        thread->colormask[1] = command->g != 0;
        thread->colormask[2] = command->b != 0;
        thread->colormask[3] = command->a != 0;
        // negating a 0/1 flag gives 0 or all-ones, which selects the channel's byte: red 0x00FF0000, green 0x0000FF00, blue 0x000000FF, alpha 0xFF000000
        thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
}
1017 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1018 {
1019         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1020         command->r = r;
1021         command->g = g;
1022         command->b = b;
1023         command->a = a;
1024 }
1025
// DepthTest command: carries the enable flag.
DEFCOMMAND(5, DepthTest, int enable;)
// Stores the depth test enable in the thread state and requests revalidation
// of the depth function state.
static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
{
        thread->depthtest = command->enable;
        thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
}
1032 void DPSOFTRAST_DepthTest(int enable)
1033 {
1034         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1035         command->enable = enable;
1036 }
1037
// ScissorTest command: carries the enable flag.
DEFCOMMAND(6, ScissorTest, int enable;)
// Stores the scissor test enable in the thread state and marks the
// framebuffer state for revalidation.
static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
{
        thread->scissortest = command->enable;
        thread->validate |= DPSOFTRAST_VALIDATE_FB;
}
1044 void DPSOFTRAST_ScissorTest(int enable)
1045 {
1046         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1047         command->enable = enable;
1048 }
1049
// Scissor command: carries the scissor rectangle as floats.
DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
// Copies the scissor rectangle into the thread state and marks the
// framebuffer state for revalidation.
static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
{
        thread->scissor[0] = command->x;
        thread->scissor[1] = command->y;
        thread->scissor[2] = command->width;
        thread->scissor[3] = command->height;
        thread->validate |= DPSOFTRAST_VALIDATE_FB;
}
1059 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1060 {
1061         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1062         command->x = x;
1063         command->y = y;
1064         command->width = width;
1065         command->height = height;
1066 }
1067
// BlendFunc command: carries the source and destination blend factors.
DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
// Stores both blend factors in the thread state and requests revalidation of
// the blend function state.
static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
{
        thread->blendfunc[0] = command->sfactor;
        thread->blendfunc[1] = command->dfactor;
        thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
}
1075 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1076 {
1077         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1078         command->sfactor = sfactor;
1079         command->dfactor = dfactor;
1080 }
1081
// BlendSubtract command: carries the enable flag.
DEFCOMMAND(9, BlendSubtract, int enable;)
// Stores the subtractive blending enable in the thread state and requests
// revalidation of the blend function state.
static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
{
        thread->blendsubtract = command->enable;
        thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
}
1088 void DPSOFTRAST_BlendSubtract(int enable)
1089 {
1090         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1091         command->enable = enable;
1092 }
1093
// DepthMask command: carries the enable flag.
DEFCOMMAND(10, DepthMask, int enable;)
// Stores the depth mask flag in the thread state.
static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
{
        thread->depthmask = command->enable;
}
1099 void DPSOFTRAST_DepthMask(int enable)
1100 {
1101         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1102         command->enable = enable;
1103 }
1104
// DepthFunc command: carries the comparison function.
DEFCOMMAND(11, DepthFunc, int func;)
// Stores the depth comparison function in the thread state.
// NOTE(review): unlike DPSOFTRAST_Interpret_DepthTest this does not set
// DPSOFTRAST_VALIDATE_DEPTHFUNC in thread->validate - confirm whether the
// validation code needs to rerun when only the function changes.
static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
{
        thread->depthfunc = command->func;
}
1110 void DPSOFTRAST_DepthFunc(int func)
1111 {
1112         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1113         command->func = func;
1114 }
1115
// DepthRange command: carries the near and far values.
DEFCOMMAND(12, DepthRange, float nearval; float farval;)
// Stores the depth range (near in [0], far in [1]) in the thread state.
static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
{
        thread->depthrange[0] = command->nearval;
        thread->depthrange[1] = command->farval;
}
1122 void DPSOFTRAST_DepthRange(float nearval, float farval)
1123 {
1124         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1125         command->nearval = nearval;
1126         command->farval = farval;
1127 }
1128
// PolygonOffset command: carries the two offset terms.
DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
// Stores the polygon offset (alongnormal in [0], intoview in [1]) in the thread state.
static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
{
        thread->polygonoffset[0] = command->alongnormal;
        thread->polygonoffset[1] = command->intoview;
}
1135 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1136 {
1137         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1138         command->alongnormal = alongnormal;
1139         command->intoview = intoview;
1140 }
1141
// CullFace command: carries the cull mode.
DEFCOMMAND(14, CullFace, int mode;)
// Stores the cull mode in the thread state.
static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
{
        thread->cullface = command->mode;
}
1147 void DPSOFTRAST_CullFace(int mode)
1148 {
1149         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1150         command->mode = mode;
1151 }
1152
// AlphaTest command: carries the enable flag.
DEFCOMMAND(15, AlphaTest, int enable;)
// Stores the alpha test enable in the thread state.
static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
{
        thread->alphatest = command->enable;
}
1158 void DPSOFTRAST_AlphaTest(int enable)
1159 {
1160         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1161         command->enable = enable;
1162 }
1163
// AlphaFunc command: carries the comparison function and reference value.
DEFCOMMAND(16, AlphaFunc, int func; float ref;)
// Stores the alpha comparison function and reference value in the thread state.
static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
{
        thread->alphafunc = command->func;
        thread->alphavalue = command->ref;
}
1170 void DPSOFTRAST_AlphaFunc(int func, float ref)
1171 {
1172         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1173         command->func = func;
1174         command->ref = ref;
1175 }
1176
// Stores the given RGBA color in dpsoftrast.color.
// No command is queued here; only the global state is written.
void DPSOFTRAST_Color4f(float r, float g, float b, float a)
{
        dpsoftrast.color[0] = r;
        dpsoftrast.color[1] = g;
        dpsoftrast.color[2] = b;
        dpsoftrast.color[3] = a;
}
1184
1185 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1186 {
1187         int outstride = blockwidth * 4;
1188         int instride = dpsoftrast.fb_width * 4;
1189         int bx1 = blockx;
1190         int by1 = blocky;
1191         int bx2 = blockx + blockwidth;
1192         int by2 = blocky + blockheight;
1193         int bw;
1194         int x;
1195         int y;
1196         unsigned char *inpixels;
1197         unsigned char *b;
1198         unsigned char *o;
1199         DPSOFTRAST_Flush();
1200         if (bx1 < 0) bx1 = 0;
1201         if (by1 < 0) by1 = 0;
1202         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1203         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1204         bw = bx2 - bx1;
1205         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1206         if (dpsoftrast.bigendian)
1207         {
1208                 for (y = by1;y < by2;y++)
1209                 {
1210                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1211                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1212                         for (x = bx1;x < bx2;x++)
1213                         {
1214                                 o[0] = b[3];
1215                                 o[1] = b[2];
1216                                 o[2] = b[1];
1217                                 o[3] = b[0];
1218                                 o += 4;
1219                                 b += 4;
1220                         }
1221                 }
1222         }
1223         else
1224         {
1225                 for (y = by1;y < by2;y++)
1226                 {
1227                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1228                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1229                         memcpy(o, b, bw*4);
1230                 }
1231         }
1232
1233 }
1234 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1235 {
1236         int tx1 = tx;
1237         int ty1 = ty;
1238         int tx2 = tx + width;
1239         int ty2 = ty + height;
1240         int sx1 = sx;
1241         int sy1 = sy;
1242         int sx2 = sx + width;
1243         int sy2 = sy + height;
1244         int swidth;
1245         int sheight;
1246         int twidth;
1247         int theight;
1248         int sw;
1249         int sh;
1250         int tw;
1251         int th;
1252         int y;
1253         unsigned int *spixels;
1254         unsigned int *tpixels;
1255         DPSOFTRAST_Texture *texture;
1256         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1257         if (mip < 0 || mip >= texture->mipmaps) return;
1258         DPSOFTRAST_Flush();
1259         spixels = dpsoftrast.fb_colorpixels[0];
1260         swidth = dpsoftrast.fb_width;
1261         sheight = dpsoftrast.fb_height;
1262         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1263         twidth = texture->mipmap[mip][2];
1264         theight = texture->mipmap[mip][3];
1265         if (tx1 < 0) tx1 = 0;
1266         if (ty1 < 0) ty1 = 0;
1267         if (tx2 > twidth) tx2 = twidth;
1268         if (ty2 > theight) ty2 = theight;
1269         if (sx1 < 0) sx1 = 0;
1270         if (sy1 < 0) sy1 = 0;
1271         if (sx2 > swidth) sx2 = swidth;
1272         if (sy2 > sheight) sy2 = sheight;
1273         tw = tx2 - tx1;
1274         th = ty2 - ty1;
1275         sw = sx2 - sx1;
1276         sh = sy2 - sy1;
1277         if (tw > sw) tw = sw;
1278         if (th > sh) th = sh;
1279         if (tw < 1 || th < 1)
1280                 return;
1281         sy1 = sheight - 1 - sy1;
1282         for (y = 0;y < th;y++)
1283                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1284         if (texture->mipmaps > 1)
1285                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1286 }
1287
// SetTexture command: carries the texture unit and the texture to bind (may be NULL).
DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
// Binds the command's texture to the given unit of this thread.
// The texture previously bound to that unit, if any, has its binds counter
// atomically decremented (DPSOFTRAST_SetTexture adds one count per thread when queuing).
static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
{
        if (thread->texbound[command->unitnum])
                ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
        thread->texbound[command->unitnum] = command->texture;
}
1295 void DPSOFTRAST_SetTexture(int unitnum, int index)
1296 {
1297         DPSOFTRAST_Command_SetTexture *command;
1298         DPSOFTRAST_Texture *texture;
1299         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1300         {
1301                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1302                 return;
1303         }
1304         texture = DPSOFTRAST_Texture_GetByIndex(index);
1305         if (index && !texture)
1306         {
1307                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1308                 return;
1309         }
1310
1311         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1312         command->unitnum = unitnum;
1313         command->texture = texture;
1314
1315         dpsoftrast.texbound[unitnum] = texture;
1316         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1317 }
1318
// Records the vertex position array and its stride in the global state.
// Only the pointer is stored; the data is not copied here.
void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
{
        dpsoftrast.pointer_vertex3f = vertex3f;
        dpsoftrast.stride_vertex = stride;
}
// Records a float color array and its stride in the global state.
// The unsigned byte color pointer is cleared so only one color source is set.
void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
{
        dpsoftrast.pointer_color4f = color4f;
        dpsoftrast.pointer_color4ub = NULL;
        dpsoftrast.stride_color = stride;
}
// Records an unsigned byte color array and its stride in the global state.
// The float color pointer is cleared so only one color source is set.
void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
{
        dpsoftrast.pointer_color4f = NULL;
        dpsoftrast.pointer_color4ub = color4ub;
        dpsoftrast.stride_color = stride;
}
// Records a texture coordinate array, its component count and its stride for
// the given unit in the global state.
// unitnum is used as an array index without a range check; the caller must pass a valid unit.
void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
{
        dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
        dpsoftrast.components_texcoord[unitnum] = numcomponents;
        dpsoftrast.stride_texcoord[unitnum] = stride;
}
1342
// SetShader command: carries shader mode, permutation and the exactspecularmath flag.
DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
// Copies the shader selection into the thread state.
static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
{
        thread->shader_mode = command->mode;
        thread->shader_permutation = command->permutation;
        thread->shader_exactspecularmath = command->exactspecularmath;
}
1350 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1351 {
1352         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1353         command->mode = mode;
1354         command->permutation = permutation;
1355         command->exactspecularmath = exactspecularmath;
1356
1357         dpsoftrast.shader_mode = mode;
1358         dpsoftrast.shader_permutation = permutation;
1359         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1360 }
1361
// Uniform4f command: carries the uniform slot and four float values.
DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
// Copies the four values into the thread's uniform4f array; each uniform
// slot occupies four consecutive floats.
static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
{
        memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
}
1367 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1368 {
1369         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1370         command->index = index;
1371         command->val[0] = v0;
1372         command->val[1] = v1;
1373         command->val[2] = v2;
1374         command->val[3] = v3;
1375
1376         dpsoftrast.uniform4f[index*4+0] = v0;
1377         dpsoftrast.uniform4f[index*4+1] = v1;
1378         dpsoftrast.uniform4f[index*4+2] = v2;
1379         dpsoftrast.uniform4f[index*4+3] = v3;
1380 }
1381 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1382 {
1383         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1384         command->index = index;
1385         memcpy(command->val, v, sizeof(command->val));
1386
1387         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1388 }
1389
// UniformMatrix4f command: carries the first uniform slot and 16 aligned floats.
DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
// Copies the 16 floats into the thread's uniform4f array starting at the
// command's slot, so a matrix fills four consecutive four-float slots.
static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
{
        memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
}
// Uploads "arraysize" consecutive 4x4 float matrices starting at uniform slot
// "uniform"; v points to 16 floats per matrix.
// For each matrix a UniformMatrix4f command is queued and the values are
// mirrored into dpsoftrast.uniform4f. If transpose is non-zero each matrix is
// transposed before it is stored.
// Without SSE_POSSIBLE the body is compiled out and the function does nothing.
void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
{
#ifdef SSE_POSSIBLE
        int i, index;
        // a matrix occupies four uniform slots, hence index advances by 4 and v by 16 floats
        for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
        {
                __m128 m0, m1, m2, m3;
                DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
                command->index = (DPSOFTRAST_UNIFORM)index;
                // use unaligned loads when the source is not aligned to ALIGN_SIZE
                if (((size_t)v)&(ALIGN_SIZE-1))
                {
                        m0 = _mm_loadu_ps(v);
                        m1 = _mm_loadu_ps(v+4);
                        m2 = _mm_loadu_ps(v+8);
                        m3 = _mm_loadu_ps(v+12);
                }
                else
                {
                        m0 = _mm_load_ps(v);
                        m1 = _mm_load_ps(v+4);
                        m2 = _mm_load_ps(v+8);
                        m3 = _mm_load_ps(v+12);
                }
                if (transpose)
                {
                        // 4x4 transpose: interleave row pairs, then recombine the halves
                        __m128 t0, t1, t2, t3;
                        t0 = _mm_unpacklo_ps(m0, m1);
                        t1 = _mm_unpacklo_ps(m2, m3);
                        t2 = _mm_unpackhi_ps(m0, m1);
                        t3 = _mm_unpackhi_ps(m2, m3);
                        m0 = _mm_movelh_ps(t0, t1);
                        m1 = _mm_movehl_ps(t1, t0);
                        m2 = _mm_movelh_ps(t2, t3);
                        m3 = _mm_movehl_ps(t3, t2);                     
                }
                // aligned stores: command->val is declared with ALIGN; dpsoftrast.uniform4f is
                // presumably aligned as well - confirm at its declaration
                _mm_store_ps(command->val, m0);
                _mm_store_ps(command->val+4, m1);
                _mm_store_ps(command->val+8, m2);
                _mm_store_ps(command->val+12, m3);
                _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
                _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
                _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
                _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
        }
#endif
}
1441
// Uniform1i command: carries the uniform slot and one integer value.
DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
// Stores the integer in the thread's uniform1i array at the command's slot.
static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
{
        thread->uniform1i[command->index] = command->val;
}
1447 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1448 {
1449         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1450         command->index = index;
1451         command->val = i0;
1452
1453         dpsoftrast.uniform1i[command->index] = i0;
1454 }
1455
1456 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1457 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1458 {
1459         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1460         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1461 }
1462 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1463 {
1464         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1465         command->clipplane[0] = x;
1466         command->clipplane[1] = y;
1467         command->clipplane[2] = z;
1468         command->clipplane[3] = w;
1469 }
1470
1471 #ifdef SSE_POSSIBLE
// Copies `size` four-float vectors from `src` to `dst`.
// Consecutive source vectors are `stride` bytes apart; destination vectors are
// tightly packed and written with aligned stores, so `dst` must be 16-byte
// aligned.  Aligned source loads are only used when both the source address
// and the stride are multiples of ALIGN_SIZE.
static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *stop = dst + size*4;
	const int aligned = ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) == 0;

	if (aligned)
	{
		for (; dst < stop; dst += 4, src += stride)
			_mm_store_ps(dst, _mm_load_ps((const float *)src));
	}
	else
	{
		for (; dst < stop; dst += 4, src += stride)
			_mm_store_ps(dst, _mm_loadu_ps((const float *)src));
	}
}
1494
// Helper for DPSOFTRAST_Load3fTo4f: takes twelve packed floats
// (x0 y0 z0 x1 | y1 z1 x2 y2 | z2 x3 y3 z3) held in v1, v2, v3 and stores the
// four vectors (xN, yN, zN, 1.0f) to dst[0..15] using aligned stores.
static void DPSOFTRAST_Load3fTo4f_Store4(float *dst, __m128 v1, __m128 v2, __m128 v3)
{
	__m128 dv;
	// each vector is first arranged as (?, x, y, z), lane 0 is replaced by 1.0f,
	// and the result is rotated into (x, y, z, 1)
	dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
	dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
	_mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
	dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
	dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
	_mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
	dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
	dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
	dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
	_mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
	dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
	_mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
}

// Expands `size` three-float vectors into four-float vectors (x, y, z, 1.0f).
//   dst    - output, size*4 floats, written with aligned stores (must be
//            16-byte aligned)
//   src    - first source vector
//   size   - number of vectors
//   stride - distance in bytes between consecutive source vectors
// Tightly packed input (stride == sizeof(float[3])) is converted four vectors
// at a time from three 16-byte loads that cover exactly those twelve floats.
// All remaining vectors are read with three scalar loads each, so no byte
// beyond the third float of any vector is touched.  (A 16-byte load per vector
// would read one float past the end of the array for the final vector.)
static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	const __m128 one = _mm_set_ss(1.0f);
	if (stride == sizeof(float[3]))
	{
		float *end4 = dst + (size&~3)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
		{
			while (dst < end4)
			{
				const float *f = (const float *)src;
				DPSOFTRAST_Load3fTo4f_Store4(dst, _mm_loadu_ps(f), _mm_loadu_ps(f + 4), _mm_loadu_ps(f + 8));
				dst += 16;
				src += 4*sizeof(float[3]);
			}
		}
		else
		{
			while (dst < end4)
			{
				const float *f = (const float *)src;
				DPSOFTRAST_Load3fTo4f_Store4(dst, _mm_load_ps(f), _mm_load_ps(f + 4), _mm_load_ps(f + 8));
				dst += 16;
				src += 4*sizeof(float[3]);
			}
		}
	}
	// leftover packed vectors, or every vector when the input is strided
	while (dst < end)
	{
		const float *f = (const float *)src;
		__m128 xy = _mm_unpacklo_ps(_mm_load_ss(f), _mm_load_ss(f + 1)); // (x, y, 0, 0)
		__m128 zw = _mm_unpacklo_ps(_mm_load_ss(f + 2), one);            // (z, 1, 0, 0)
		_mm_store_ps(dst, _mm_movelh_ps(xy, zw));                        // (x, y, z, 1)
		dst += 4;
		src += stride;
	}
}
1571
// Expands `size` two-float vectors into four-float vectors (s, t, 0.0f, 1.0f).
// Consecutive source vectors are `stride` bytes apart; `dst` is written with
// aligned stores.  Tightly packed input is handled two vectors per 16-byte
// load; any leftover vector, and all strided input, is read 8 bytes at a time.
static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *stop = dst + size*4;
	const __m128 tail = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f); // supplies the (0, 1) upper half
	if (stride == sizeof(float[2]))
	{
		float *pairstop = dst + (size&~1)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // source not 16-byte aligned
		{
			for (; dst < pairstop; dst += 8, src += 2*sizeof(float[2]))
			{
				__m128 st = _mm_loadu_ps((const float *)src);
				_mm_store_ps(dst, _mm_shuffle_ps(st, tail, _MM_SHUFFLE(3, 2, 1, 0)));
				_mm_store_ps(dst + 4, _mm_movehl_ps(tail, st));
			}
		}
		else
		{
			for (; dst < pairstop; dst += 8, src += 2*sizeof(float[2]))
			{
				__m128 st = _mm_load_ps((const float *)src);
				_mm_store_ps(dst, _mm_shuffle_ps(st, tail, _MM_SHUFFLE(3, 2, 1, 0)));
				_mm_store_ps(dst + 4, _mm_movehl_ps(tail, st));
			}
		}
	}
	for (; dst < stop; dst += 4, src += stride)
		_mm_store_ps(dst, _mm_loadl_pi(tail, (const __m64 *)src));
}
1609
// Converts `size` four-byte vectors into four-float vectors, scaling every
// byte by 1/255 so that 0..255 maps to 0.0..1.0.
//   dst    - output, size*4 floats, written with aligned stores (must be
//            16-byte aligned)
//   src    - first source vector
//   size   - number of vectors
//   stride - distance in bytes between consecutive source vectors
// Tightly packed input is converted four vectors (16 bytes) per iteration;
// leftover vectors and strided input are converted one vector at a time.
static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	__m128 scale = _mm_set1_ps(1.0f/255.0f);
	const __m128i zero = _mm_setzero_si128();
	if (stride == sizeof(unsigned char[4]))
	{
		float *end4 = dst + (size&~3)*4;
		if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
		{
			while (dst < end4)
			{
				// widen 16 bytes to 16-bit lanes, then to 32-bit lanes, then to float
				__m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, zero), v2 = _mm_unpackhi_epi8(v, zero);
				_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, zero)), scale));
				_mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, zero)), scale));
				_mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, zero)), scale));
				_mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, zero)), scale));
				dst += 16;
				src += 4*sizeof(unsigned char[4]);
			}
		}
		else
		{
			while (dst < end4)
			{
				__m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, zero), v2 = _mm_unpackhi_epi8(v, zero);
				_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, zero)), scale));
				_mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, zero)), scale));
				_mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, zero)), scale));
				_mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, zero)), scale));
				dst += 16;
				src += 4*sizeof(unsigned char[4]);
			}
		}
	}
	while (dst < end)
	{
		int packed;
		__m128i v;
		// src is a byte pointer with no alignment guarantee, so fetch the four
		// bytes with memcpy rather than dereferencing it as an int
		memcpy(&packed, src, sizeof(packed));
		v = _mm_cvtsi32_si128(packed);
		_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, zero), zero)), scale));
		dst += 4;
		src += stride;
	}
}
1652
// Writes `size` copies of the four floats at `src` to `dst`.
// `src` is read once with an unaligned load; `dst` is written with aligned
// stores and must therefore be 16-byte aligned.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	float *stop = dst + 4*size;

	for (; dst < stop; dst += 4)
		_mm_store_ps(dst, value);
}
1663 #endif
1664
// Transforms `numitems` four-float vectors by a 4x4 matrix:
//   out = in.x*row0 + in.y*row1 + in.z*row2 + in.w*row3
// where rowN is inmatrix16f[N*4 .. N*4+3].
// If the matrix is bit-for-bit the identity, the input is simply copied
// (or left alone when out4f == in4f).  Results are written with aligned
// stores, so out4f must be 16-byte aligned; in4f may be unaligned.
// Without SSE_POSSIBLE the function body is empty and out4f is not written.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 row0, row1, row2, row3;
	float *stop;

	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}

	stop = out4f + numitems*4;
	row0 = _mm_loadu_ps(inmatrix16f);
	row1 = _mm_loadu_ps(inmatrix16f + 4);
	row2 = _mm_loadu_ps(inmatrix16f + 8);
	row3 = _mm_loadu_ps(inmatrix16f + 12);

	if (((size_t)in4f)&(ALIGN_SIZE-1)) // input not 16-byte aligned
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			__m128 v = _mm_loadu_ps(in4f);
			__m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			__m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			__m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			__m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
	else
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			__m128 v = _mm_load_ps(in4f);
			__m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			__m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			__m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			__m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
#endif
}
1712
// Copies `numitems` four-float vectors from in4f to out4f.
// A non-positive count or an in-place copy (out4f == in4f) is a no-op, and
// overlapping source/destination ranges are copied correctly (memmove), so
// callers may reuse one array for input and output.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0 || out4f == in4f)
		return;
	memmove(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
1717
1718 #ifdef SSE_POSSIBLE
// Perspective-divides and viewport-maps one vertex.
// With in = (x, y, z, w) and the viewport vectors laid out as
// (c0, cx, cy, cz) / (s0, sx, sy, sz), the result stored in `out` is
//   (cx + sx*x/w, cy + sy*y/w, cz + sz*z/w, c0 + s0/w).
// `in` is read before `out` is written, so both may name the same variable.
// The expansion is a brace block declaring locals named p and w.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1726
// Despite the name, this performs the full projection of all four lanes:
// for in = (x, y, z, w) and viewport vectors laid out as (c0, cx, cy, cz) /
// (s0, sx, sy, sz) it stores
//   (cx + sx*x/w, cy + sy*y/w, cz + sz*z/w, c0 + s0/w)
// in `out`.  The expansion is a brace block declaring locals named p and w.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1734
// Multiplies one four-float vector by a matrix given as four row registers:
//   out = in.x*m0 + (in.y*m1 + (in.z*m2 + in.w*m3))
// `in` is captured in a local named p before `out` is assigned, so `out` and
// `in` may name the same variable.  The expansion is a brace block.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	out = _mm_add_ps( \
		_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
		_mm_add_ps( \
			_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
			_mm_add_ps( \
				_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
				_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1743
1744 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1745 {
1746         int clipmask = 0xFF;
1747         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1748         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1749         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1750         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1751         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1752         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1753         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1754         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1755         #define BBFRONT(k, pos) \
1756         { \
1757                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1758                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1759                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1760                 { \
1761                         __m128 proj; \
1762                         clipmask &= ~(1<<k); \
1763                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1764                         minproj = _mm_min_ss(minproj, proj); \
1765                         maxproj = _mm_max_ss(maxproj, proj); \
1766                 } \
1767         }
1768         BBFRONT(0, minpos); 
1769         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1770         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1771         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1772         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1773         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1774         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1775         BBFRONT(7, maxpos);
1776         #define BBCLIP(k) \
1777         { \
1778                 if (clipmask&(1<<k)) \
1779                 { \
1780                         if (!(clipmask&(1<<(k^1)))) \
1781                         { \
1782                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1783                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1784                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1785                                 minproj = _mm_min_ss(minproj, proj); \
1786                                 maxproj = _mm_max_ss(maxproj, proj); \
1787                         } \
1788                         if (!(clipmask&(1<<(k^2)))) \
1789                         { \
1790                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1791                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1792                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1793                                 minproj = _mm_min_ss(minproj, proj); \
1794                                 maxproj = _mm_max_ss(maxproj, proj); \
1795                         } \
1796                         if (!(clipmask&(1<<(k^4)))) \
1797                         { \
1798                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1799                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1800                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1801                                 minproj = _mm_min_ss(minproj, proj); \
1802                                 maxproj = _mm_max_ss(maxproj, proj); \
1803                         } \
1804                 } \
1805         }
1806         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1807         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1808         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1809         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1810         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1811         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1812         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1813         *starty = _mm_cvttss_si32(maxproj);
1814         *endy = _mm_cvttss_si32(minproj)+1;
1815         return clipmask;
1816 }
1817         
1818 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1819 {
1820         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1821         float *end = out4f + numitems*4;
1822         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1823         __m128 minpos, maxpos;
1824         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1825         {
1826                 minpos = maxpos = _mm_loadu_ps(in4f);
1827                 while (out4f < end)
1828                 {
1829                         __m128 v = _mm_loadu_ps(in4f);
1830                         minpos = _mm_min_ps(minpos, v);
1831                         maxpos = _mm_max_ps(maxpos, v);
1832                         _mm_store_ps(out4f, v);
1833                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1834                         _mm_store_ps(screen4f, v);
1835                         in4f += 4;
1836                         out4f += 4;
1837                         screen4f += 4;
1838                 }
1839         }
1840         else
1841         {
1842                 minpos = maxpos = _mm_load_ps(in4f);
1843                 while (out4f < end)
1844                 {
1845                         __m128 v = _mm_load_ps(in4f);
1846                         minpos = _mm_min_ps(minpos, v);
1847                         maxpos = _mm_max_ps(maxpos, v);
1848                         _mm_store_ps(out4f, v);
1849                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1850                         _mm_store_ps(screen4f, v);
1851                         in4f += 4;
1852                         out4f += 4;
1853                         screen4f += 4;
1854                 }
1855         }
1856         if (starty && endy) 
1857         {
1858                 ALIGN(float minposf[4]);
1859                 ALIGN(float maxposf[4]);
1860                 _mm_store_ps(minposf, minpos);
1861                 _mm_store_ps(maxposf, maxpos);
1862                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1863         }
1864         return 0;
1865 }
1866
1867 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1868 {
1869         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1870         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1871         float *end;
1872         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1873                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1874         end = out4f + numitems*4;
1875         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1876         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1877         m0 = _mm_loadu_ps(inmatrix16f);
1878         m1 = _mm_loadu_ps(inmatrix16f + 4);
1879         m2 = _mm_loadu_ps(inmatrix16f + 8);
1880         m3 = _mm_loadu_ps(inmatrix16f + 12);
1881         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1882         {
1883                 minpos = maxpos = _mm_loadu_ps(in4f);
1884                 while (out4f < end)
1885                 {
1886                         __m128 v = _mm_loadu_ps(in4f);
1887                         minpos = _mm_min_ps(minpos, v);
1888                         maxpos = _mm_max_ps(maxpos, v);
1889                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1890                         _mm_store_ps(out4f, v);
1891                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1892                         _mm_store_ps(screen4f, v);
1893                         in4f += 4;
1894                         out4f += 4;
1895                         screen4f += 4;
1896                 }
1897         }
1898         else
1899         {
1900                 minpos = maxpos = _mm_load_ps(in4f);
1901                 while (out4f < end)
1902                 {
1903                         __m128 v = _mm_load_ps(in4f);
1904                         minpos = _mm_min_ps(minpos, v);
1905                         maxpos = _mm_max_ps(maxpos, v);
1906                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1907                         _mm_store_ps(out4f, v);
1908                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1909                         _mm_store_ps(screen4f, v);
1910                         in4f += 4;
1911                         out4f += 4;
1912                         screen4f += 4;
1913                 }
1914         }
1915         if (starty && endy) 
1916         {
1917                 ALIGN(float minposf[4]);
1918                 ALIGN(float maxposf[4]);
1919                 _mm_store_ps(minposf, minpos);
1920                 _mm_store_ps(maxposf, maxpos);
1921                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1922         }
1923         return 0;
1924 }
1925 #endif
1926
// Expands one vertex attribute array of the current draw (vertices
// dpsoftrast.firstvertex .. firstvertex + numvertices - 1) into four-float
// vectors in dpsoftrast.post_array4f[outarray] and returns that buffer.
//   DPSOFTRAST_ARRAY_POSITION - three floats per vertex
//   DPSOFTRAST_ARRAY_COLOR    - float colors if pointer_color4f is set, else
//                               byte colors if pointer_color4ub is set, else
//                               the constant dpsoftrast.color for every vertex
//   anything else             - texcoord unit (inarray - DPSOFTRAST_ARRAY_TEXCOORD0)
//                               with 2, 3 or 4 float components
// The buffer is returned untouched when the texcoord pointer is NULL or the
// component count is not 2, 3 or 4.  Without SSE_POSSIBLE this returns NULL.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const unsigned char *inb;
	int firstvertex = dpsoftrast.firstvertex;
	int numvertices = dpsoftrast.numvertices;
	int stride;

	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
	}
	else
	{
		int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			int components = dpsoftrast.components_texcoord[unit];
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			if (components == 2)
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
			else if (components == 3)
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
			else if (components == 4)
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1985
1986 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1987 {
1988         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1989         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1990         return data;
1991 }
1992
#if 0
// Compiled out.  Matrix-free variant of DPSOFTRAST_Array_TransformProject:
// projects the array in place with DPSOFTRAST_Vertex_Project, filling
// dpsoftrast.screencoord4f, drawstarty/drawendy and drawclipped.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;

	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
2005
// Transforms a vertex array in place by inmatrix16f, writes the projected
// coordinates to dpsoftrast.screencoord4f and returns the transformed array.
// With inarray >= 0 the data is first loaded via DPSOFTRAST_Array_Load; a
// negative inarray means post_array4f[outarray] already holds it.
// Side effects: dpsoftrast.drawstarty / drawendy receive the range computed by
// DPSOFTRAST_Vertex_TransformProject and dpsoftrast.drawclipped its return
// value.  Without SSE_POSSIBLE this returns NULL and does nothing.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;

	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2016
2017 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2018 {
2019         int x;
2020         int startx = span->startx;
2021         int endx = span->endx;
2022         float wslope = triangle->w[0];
2023         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2024         float endz = 1.0f / (w + wslope * startx);
2025         if (triangle->w[0] == 0)
2026         {
2027                 // LordHavoc: fast flat polygons (HUD/menu)
2028                 for (x = startx;x < endx;x++)
2029                         zf[x] = endz;
2030                 return;
2031         }
2032         for (x = startx;x < endx;)
2033         {
2034                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2035                 float z = endz, dz;
2036                 if (nextsub >= endx) nextsub = endsub = endx-1;
2037                 endz = 1.0f / (w + wslope * nextsub);
2038                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2039                 for (; x <= endsub; x++, z += dz)
2040                         zf[x] = z;
2041         }
2042 }
2043
// Writes one span of shaded pixels (in4ub, 4 bytes per pixel, alpha in byte 3)
// into the color buffer dpsoftrast.fb_colorpixels[0], combining them with the
// pixels already there according to thread->fb_blendmode.
// in4ub and span->pixelmask are indexed by x in [span->startx, span->endx);
// the destination row starts at framebuffer pixel (span->x, span->y).
// Only pixels whose pixelmask entry is nonzero are written.  The mask is
// additionally cleared here for pixels with alpha < 128 when thread->alphatest
// is set, and for pixels with alpha 0 in the alpha-weighted blend modes.
// NOTE: two sentinel bytes are stored at pixelmask[endx] and pixelmask[endx+1],
// so the mask buffer must have room for endx+2 entries.
// The whole body is compiled only when SSE_POSSIBLE is defined, otherwise this
// function does nothing.  triangle is not referenced.
2044 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2045 {
2046 #ifdef SSE_POSSIBLE
2047         int x;
2048         int startx = span->startx;
2049         int endx = span->endx;
2050         int subx;
             // 32bit views of the source and destination, one element per pixel
2051         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2052         unsigned char * RESTRICT pixelmask = span->pixelmask;
2053         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2054         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
             // nothing to do without a color buffer
2055         if (!pixel)
2056                 return;
             // move both views to the start of this span's row; all writes below
             // go through pixeli, pixel itself is not used after this point
2057         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2058         pixeli += span->y * dpsoftrast.fb_width + span->x;
2059         // handle alphatest now (this affects depth writes too)
2060         if (thread->alphatest)
2061                 for (x = startx;x < endx;x++)
2062                         if (in4ub[x*4+3] < 128)
2063                                 pixelmask[x] = false;
2064         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2065         // helps sprites, text and hud artwork
2066         switch(thread->fb_blendmode)
2067         {
2068         case DPSOFTRAST_BLENDMODE_ALPHA:
2069         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2070         case DPSOFTRAST_BLENDMODE_SUBALPHA:
                     // a source alpha of 0 contributes nothing in these modes
2071                 for (x = startx;x < endx;x++)
2072                         if (in4ub[x*4+3] < 1)
2073                                 pixelmask[x] = false;
2074                 break;
2075         case DPSOFTRAST_BLENDMODE_OPAQUE:
2076         case DPSOFTRAST_BLENDMODE_ADD:
2077         case DPSOFTRAST_BLENDMODE_INVMOD:
2078         case DPSOFTRAST_BLENDMODE_MUL:
2079         case DPSOFTRAST_BLENDMODE_MUL2:
2080         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2081         case DPSOFTRAST_BLENDMODE_INVADD:
2082                 break;
2083         }
2084         // put some special values at the end of the mask to ensure the loops end
             // the 1 at endx stops the search for the next unmasked pixel, the 0 at
             // endx+1 stops the search for the end of a run of unmasked pixels
2085         pixelmask[endx] = 1;
2086         pixelmask[endx+1] = 0;
2087         // LordHavoc: use a double loop to identify subspans, this helps the
2088         // optimized copy/blend loops to perform at their best, most triangles
2089         // have only one run of pixels, and do the search using wide reads...
2090         x = startx;
2091         while (x < endx)
2092         {
2093                 // if this pixel is masked off, it's probably not alone...
2094                 if (!pixelmask[x])
2095                 {
2096                         x++;
2097 #if 1
                             // fast path, only taken when more than 8 pixels remain: step
                             // x to a multiple of 4, then skip four mask bytes per read
                             // NOTE(review): pixelmask is read through an unsigned int
                             // pointer here; only the index is aligned, so this relies
                             // on the mask buffer itself being suitably aligned
2098                         if (x + 8 < endx)
2099                         {
2100                                 // the 4-item search must be aligned or else it stalls badly
2101                                 if ((x & 3) && !pixelmask[x]) 
2102                                 {
2103                                         x++;
2104                                         if ((x & 3) && !pixelmask[x]) 
2105                                         {
2106                                                 x++;
2107                                                 if ((x & 3) && !pixelmask[x]) x++;
2108                                         }
2109                                 }
2110                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2111                                         x += 4;
2112                         }
2113 #endif
                             // bytewise search for the next unmasked pixel, the sentinel at
                             // pixelmask[endx] guarantees termination
2114                         for (;!pixelmask[x];x++)
2115                                 ;
2116                         // rather than continue the loop, just check the end variable
2117                         if (x >= endx)
2118                                 break;
2119                 }
2120                 // find length of subspan
                     // x is the first unmasked pixel of the run, subx becomes the index
                     // one past its last pixel
2121                 subx = x + 1;
2122 #if 1
                     // same aligned 4-bytes-at-a-time search as above, this time for
                     // mask bytes that are all exactly 1
2123                 if (x + 8 < endx)
2124                 {
2125                         if ((subx & 3) && pixelmask[subx]) 
2126                         {
2127                                 subx++;
2128                                 if ((subx & 3) && pixelmask[subx]) 
2129                                 {
2130                                         subx++;
2131                                         if ((subx & 3) && pixelmask[subx]) subx++;
2132                                 }
2133                         }
2134                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2135                                 subx += 4;
2136                 }
2137 #endif
                     // bytewise search for the end of the run, stops at the latest on
                     // the 0 sentinel at pixelmask[endx+1]
2138                 for (;pixelmask[subx];subx++)
2139                         ;
2140                 // the checks can overshoot, so make sure to clip it...
2141                 if (subx > endx)
2142                         subx = endx;
2143                 // now that we know the subspan length...  process!
                     // every case below leaves x == subx, so the outer loop resumes
                     // right after the run that was just written
2144                 switch(thread->fb_blendmode)
2145                 {
2146                 case DPSOFTRAST_BLENDMODE_OPAQUE:
                             // plain copy: 16 pixels per iteration, then 4, then 2, then 1
2147 #if 0
2148                         if (subx - x >= 16)
2149                         {
2150                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2151                                 x = subx;
2152                         }
2153                         else
2154 #elif 1
2155                         while (x + 16 <= subx)
2156                         {
2157                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2158                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2159                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2160                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2161                                 x += 16;
2162                         }
2163 #endif
2164                         {
2165                                 while (x + 4 <= subx)
2166                                 {
2167                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2168                                         x += 4;
2169                                 }
2170                                 if (x + 2 <= subx)
2171                                 {
2172                                         pixeli[x] = ini[x];
2173                                         pixeli[x+1] = ini[x+1];
2174                                         x += 2;
2175                                 }
2176                                 if (x < subx)
2177                                 {
2178                                         pixeli[x] = ini[x];
2179                                         x++;
2180                                 }
2181                         }
2182                         break;
2183                 case DPSOFTRAST_BLENDMODE_ALPHA:
                     // FINISHBLEND(blend2, blend1) blends the run [x, subx):
                     // pixels are handled two at a time using the statement blend2,
                     // and a trailing odd pixel is handled using blend1.
                     // Both statements see src (incoming color) and dst (framebuffer
                     // color) with every byte widened to a 16bit lane and must leave
                     // the result in dst; it is packed back to bytes with unsigned
                     // saturation, so results outside 0..255 are clamped.
                     // The macro stays defined for the remaining cases of this switch.
2184                 #define FINISHBLEND(blend2, blend1) \
2185                         for (;x + 1 < subx;x += 2) \
2186                         { \
2187                                 __m128i src, dst; \
2188                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2189                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2190                                 blend2; \
2191                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2192                         } \
2193                         if (x < subx) \
2194                         { \
2195                                 __m128i src, dst; \
2196                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2197                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2198                                 blend1; \
2199                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2200                                 x++; \
2201                         }
                             // dst += (src - dst) * alpha / 256
                             // blend is the source alpha (lane 3) copied to all lanes of
                             // its pixel; both factors are shifted left by 4 so that the
                             // high half of the 32bit product is the product >> 8
2202                         FINISHBLEND({
2203                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2204                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2205                         }, {
2206                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2207                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2208                         });
2209                         break;
2210                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
                             // dst += (src * alpha) >> 8
2211                         FINISHBLEND({
2212                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2213                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2214                         }, {
2215                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2216                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2217                         });
2218                         break;
2219                 case DPSOFTRAST_BLENDMODE_ADD:
                             // dst += src
2220                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2221                         break;
2222                 case DPSOFTRAST_BLENDMODE_INVMOD:
                             // dst -= (dst * src) >> 8
2223                         FINISHBLEND({
2224                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2225                         }, {
2226                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2227                         });
2228                         break;
2229                 case DPSOFTRAST_BLENDMODE_MUL:
                             // dst = (src * dst) >> 8
2230                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2231                         break;
2232                 case DPSOFTRAST_BLENDMODE_MUL2:
                             // dst = (src * dst) >> 7, twice the MUL result
2233                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2234                         break;
2235                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
                             // dst -= (src * alpha) >> 8
2236                         FINISHBLEND({
2237                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2238                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2239                         }, {
2240                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2241                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2242                         });
2243                         break;
2244                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
                             // dst = src + dst - ((dst * alpha) >> 8)
2245                         FINISHBLEND({
2246                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2247                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2248                         }, {
2249                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2250                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2251                         });
2252                         break;
2253                 case DPSOFTRAST_BLENDMODE_INVADD:
                             // dst += (255 - dst) * src / 256, using the same shifted
                             // high-half multiply as the ALPHA case
2254                         FINISHBLEND({
2255                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2256                         }, {
2257                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2258                         });
2259                         break;
2260                 }
2261         }
2262 #endif
2263 }
2264
2265 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2266 {
2267         int x;
2268         int startx = span->startx;
2269         int endx = span->endx;
2270         int flags;
2271         float c[4];
2272         float data[4];
2273         float slope[4];
2274         float tc[2], endtc[2];
2275         float tcscale[2];
2276         unsigned int tci[2];
2277         unsigned int tci1[2];
2278         unsigned int tcimin[2];
2279         unsigned int tcimax[2];
2280         int tciwrapmask[2];
2281         int tciwidth;
2282         int filter;
2283         int mip;
2284         const unsigned char * RESTRICT pixelbase;
2285         const unsigned char * RESTRICT pixel[4];
2286         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2287         // if no texture is bound, just fill it with white
2288         if (!texture)
2289         {
2290                 for (x = startx;x < endx;x++)
2291                 {
2292                         out4f[x*4+0] = 1.0f;
2293                         out4f[x*4+1] = 1.0f;
2294                         out4f[x*4+2] = 1.0f;
2295                         out4f[x*4+3] = 1.0f;
2296                 }
2297                 return;
2298         }
2299         mip = triangle->mip[texunitindex];
2300         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2301         // if this mipmap of the texture is 1 pixel, just fill it with that color
2302         if (texture->mipmap[mip][1] == 4)
2303         {
2304                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2305                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2306                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2307                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2308                 for (x = startx;x < endx;x++)
2309                 {
2310                         out4f[x*4+0] = c[0];
2311                         out4f[x*4+1] = c[1];
2312                         out4f[x*4+2] = c[2];
2313                         out4f[x*4+3] = c[3];
2314                 }
2315                 return;
2316         }
2317         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2318         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2319         flags = texture->flags;
2320         tcscale[0] = texture->mipmap[mip][2];
2321         tcscale[1] = texture->mipmap[mip][3];
2322         tciwidth = texture->mipmap[mip][2];
2323         tcimin[0] = 0;
2324         tcimin[1] = 0;
2325         tcimax[0] = texture->mipmap[mip][2]-1;
2326         tcimax[1] = texture->mipmap[mip][3]-1;
2327         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2328         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2329         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2330         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2331         if (filter)
2332         {
2333                 endtc[0] -= 0.5f;
2334                 endtc[1] -= 0.5f;
2335         }
2336         for (x = startx;x < endx;)
2337         {
2338                 unsigned int subtc[2];
2339                 unsigned int substep[2];
2340                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2341                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2342                 if (nextsub >= endx)
2343                 {
2344                         nextsub = endsub = endx-1;      
2345                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2346                 }
2347                 tc[0] = endtc[0];
2348                 tc[1] = endtc[1];
2349                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2350                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2351                 if (filter)
2352                 {
2353                         endtc[0] -= 0.5f;
2354                         endtc[1] -= 0.5f;
2355                 }
2356                 substep[0] = (endtc[0] - tc[0]) * subscale;
2357                 substep[1] = (endtc[1] - tc[1]) * subscale;
2358                 subtc[0] = tc[0] * (1<<12);
2359                 subtc[1] = tc[1] * (1<<12);
2360                 if (filter)
2361                 {
2362                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2363                         {
2364                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2365                                 {
2366                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2367                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2368                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2369                                         tci[0] = subtc[0]>>12;
2370                                         tci[1] = subtc[1]>>12;
2371                                         tci1[0] = tci[0] + 1;
2372                                         tci1[1] = tci[1] + 1;
2373                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2374                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2375                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2376                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2377                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2378                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2379                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2380                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2381                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2382                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2383                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2384                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2385                                         out4f[x*4+0] = c[0];
2386                                         out4f[x*4+1] = c[1];
2387                                         out4f[x*4+2] = c[2];
2388                                         out4f[x*4+3] = c[3];
2389                                 }
2390                         }
2391                         else
2392                         {
2393                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2394                                 {
2395                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2396                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2397                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2398                                         tci[0] = subtc[0]>>12;
2399                                         tci[1] = subtc[1]>>12;
2400                                         tci1[0] = tci[0] + 1;
2401                                         tci1[1] = tci[1] + 1;
2402                                         tci[0] &= tciwrapmask[0];
2403                                         tci[1] &= tciwrapmask[1];
2404                                         tci1[0] &= tciwrapmask[0];
2405                                         tci1[1] &= tciwrapmask[1];
2406                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2407                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2408                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2409                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2410                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2411                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2412                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2413                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2414                                         out4f[x*4+0] = c[0];
2415                                         out4f[x*4+1] = c[1];
2416                                         out4f[x*4+2] = c[2];
2417                                         out4f[x*4+3] = c[3];
2418                                 }
2419                         }
2420                 }
2421                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2422                 {
2423                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2424                         {
2425                                 tci[0] = subtc[0]>>12;
2426                                 tci[1] = subtc[1]>>12;
2427                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2428                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2429                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2430                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2431                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2432                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2433                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2434                                 out4f[x*4+0] = c[0];
2435                                 out4f[x*4+1] = c[1];
2436                                 out4f[x*4+2] = c[2];
2437                                 out4f[x*4+3] = c[3];
2438                         }
2439                 }
2440                 else
2441                 {
2442                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2443                         {
2444                                 tci[0] = subtc[0]>>12;
2445                                 tci[1] = subtc[1]>>12;
2446                                 tci[0] &= tciwrapmask[0];
2447                                 tci[1] &= tciwrapmask[1];
2448                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2449                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2450                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2451                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2452                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2453                                 out4f[x*4+0] = c[0];
2454                                 out4f[x*4+1] = c[1];
2455                                 out4f[x*4+2] = c[2];
2456                                 out4f[x*4+3] = c[3];
2457                         }
2458                 }
2459         }
2460 }
2461
2462 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2463 {
2464 #ifdef SSE_POSSIBLE
2465         int x;
2466         int startx = span->startx;
2467         int endx = span->endx;
2468         int flags;
2469         __m128 data, slope, tcscale;
2470         __m128i tcsize, tcmask, tcoffset, tcmax;
2471         __m128 tc, endtc;
2472         __m128i subtc, substep, endsubtc;
2473         int filter;
2474         int mip;
2475         int affine; // LordHavoc: optimized affine texturing case
2476         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2477         const unsigned char * RESTRICT pixelbase;
2478         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2479         // if no texture is bound, just fill it with white
2480         if (!texture)
2481         {
2482                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2483                 return;
2484         }
2485         mip = triangle->mip[texunitindex];
2486         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2487         // if this mipmap of the texture is 1 pixel, just fill it with that color
2488         if (texture->mipmap[mip][1] == 4)
2489         {
2490                 unsigned int k = *((const unsigned int *)pixelbase);
2491                 for (x = startx;x < endx;x++)
2492                         outi[x] = k;
2493                 return;
2494         }
2495         affine = zf[startx] == zf[endx-1];
2496         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2497         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2498         flags = texture->flags;
2499         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2500         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2501         tcscale = _mm_cvtepi32_ps(tcsize);
2502         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2503         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2504         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2505         if (filter)
2506                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2507         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2508         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2509         tcmax = _mm_packs_epi32(tcmask, tcmask);
2510         for (x = startx;x < endx;)
2511         {
2512                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2513                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2514                 if (nextsub >= endx || affine)
2515                 {
2516                         nextsub = endsub = endx-1;
2517                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2518                 }       
2519                 tc = endtc;
2520                 subtc = endsubtc;
2521                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2522                 if (filter)
2523                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2524                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2525                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2526                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2527                 substep = _mm_slli_epi32(substep, 1);
2528                 if (filter)
2529                 {
2530                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2531                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2532                         {
2533                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2534                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2535                                 {
2536                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2537                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2538                                         tci = _mm_madd_epi16(tci, tcoffset);
2539                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2540                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2541                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2542                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2543                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2544                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2545                                         fracm = _mm_srli_epi16(subtc, 1);
2546                                         pix1 = _mm_add_epi16(pix1,
2547                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2548                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2549                                         pix3 = _mm_add_epi16(pix3,
2550                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2551                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2552                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2553                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2554                                         pix2 = _mm_add_epi16(pix2,
2555                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2556                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2557                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2558                                 }
2559                                 if (x <= endsub)
2560                                 {
2561                                         const unsigned char * RESTRICT ptr1;
2562                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2563                                         tci = _mm_madd_epi16(tci, tcoffset);
2564                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2565                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2566                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2567                                         fracm = _mm_srli_epi16(subtc, 1);
2568                                         pix1 = _mm_add_epi16(pix1,
2569                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2570                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2571                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2572                                         pix1 = _mm_add_epi16(pix1,
2573                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2574                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2575                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2576                                         x++;
2577                                 }
2578                         }
2579                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2580                         {
2581                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2582                                 {
2583                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2584                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2585                                         tci = _mm_madd_epi16(tci, tcoffset);
2586                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2587                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2588                                                                                         _mm_setzero_si128());
2589                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2590                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2591                                                                                         _mm_setzero_si128());
2592                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2593                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2594                                         tci = _mm_madd_epi16(tci, tcoffset);
2595                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2596                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2597                                                                                         _mm_setzero_si128());
2598                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2599                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2600                                                                                         _mm_setzero_si128());
2601                                         fracm = _mm_srli_epi16(subtc, 1);
2602                                         pix1 = _mm_add_epi16(pix1,
2603                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2604                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2605                                         pix3 = _mm_add_epi16(pix3,
2606                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2607                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2608                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2609                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2610                                         pix2 = _mm_add_epi16(pix2,
2611                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2612                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2613                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2614                                 }
2615                                 if (x <= endsub)
2616                                 {
2617                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2618                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2619                                         tci = _mm_madd_epi16(tci, tcoffset);
2620                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2621                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2622                                                                                         _mm_setzero_si128());
2623                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2624                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2625                                                                                         _mm_setzero_si128());
2626                                         fracm = _mm_srli_epi16(subtc, 1);
2627                                         pix1 = _mm_add_epi16(pix1,
2628                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2629                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2630                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2631                                         pix1 = _mm_add_epi16(pix1,
2632                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2633                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2634                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2635                                         x++;
2636                                 }
2637                         }
2638                         else
2639                         {
2640                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2641                                 {
2642                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2643                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2644                                         tci = _mm_madd_epi16(tci, tcoffset);
2645                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2646                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2647                                                                                         _mm_setzero_si128());
2648                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2649                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2650                                                                                         _mm_setzero_si128());
2651                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2652                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2653                                         tci = _mm_madd_epi16(tci, tcoffset);
2654                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2655                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2656                                                                                         _mm_setzero_si128());
2657                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2658                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2659                                                                                         _mm_setzero_si128());
2660                                         fracm = _mm_srli_epi16(subtc, 1);
2661                                         pix1 = _mm_add_epi16(pix1,
2662                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2663                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2664                                         pix3 = _mm_add_epi16(pix3,
2665                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2666                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2667                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2668                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2669                                         pix2 = _mm_add_epi16(pix2,
2670                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2671                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2672                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2673                                 }
2674                                 if (x <= endsub)
2675                                 {
2676                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2677                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2678                                         tci = _mm_madd_epi16(tci, tcoffset);
2679                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2680                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2681                                                                                         _mm_setzero_si128());
2682                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2683                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2684                                                                                         _mm_setzero_si128());
2685                                         fracm = _mm_srli_epi16(subtc, 1);
2686                                         pix1 = _mm_add_epi16(pix1,
2687                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2688                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2689                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2690                                         pix1 = _mm_add_epi16(pix1,
2691                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2692                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2693                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2694                                         x++;
2695                                 }
2696                         }
2697                 }
2698                 else
2699                 {
2700                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2701                         {
2702                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2703                                 {
2704                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2705                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2706                                         tci = _mm_madd_epi16(tci, tcoffset);
2707                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2708                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2709                                 }
2710                                 if (x <= endsub)
2711                                 {
2712                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2713                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2714                                         tci = _mm_madd_epi16(tci, tcoffset);
2715                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2716                                         x++;
2717                                 }
2718                         }
2719                         else
2720                         {
2721                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2722                                 {
2723                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2724                                         tci = _mm_and_si128(tci, tcmax); 
2725                                         tci = _mm_madd_epi16(tci, tcoffset);
2726                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2727                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2728                                 }
2729                                 if (x <= endsub)
2730                                 {
2731                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2732                                         tci = _mm_and_si128(tci, tcmax); 
2733                                         tci = _mm_madd_epi16(tci, tcoffset);
2734                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2735                                         x++;
2736                                 }
2737                         }
2738                 }
2739         }
2740 #endif
2741 }
2742
2743 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2744 {
2745         // TODO: IMPLEMENT
2746         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2747 }
2748
// Shadow map lookup stub: no sampling is performed yet, so the result is
// always 1.0f no matter which vector is passed in.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2754
2755 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2756 {
2757         int x;
2758         int startx = span->startx;
2759         int endx = span->endx;
2760         float c[4];
2761         float data[4];
2762         float slope[4];
2763         float z;
2764         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2765         for (x = startx;x < endx;x++)
2766         {
2767                 z = zf[x];
2768                 c[0] = (data[0] + slope[0]*x) * z;
2769                 c[1] = (data[1] + slope[1]*x) * z;
2770                 c[2] = (data[2] + slope[2]*x) * z;
2771                 c[3] = (data[3] + slope[3]*x) * z;
2772                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2773                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2774                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2775                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2776         }
2777 }
2778
2779 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2780 {
2781         int x;
2782         int startx = span->startx;
2783         int endx = span->endx;
2784         float c[4];
2785         float data[4];
2786         float slope[4];
2787         float z;
2788         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2789         for (x = startx;x < endx;x++)
2790         {
2791                 z = zf[x];
2792                 c[0] = (data[0] + slope[0]*x) * z;
2793                 c[1] = (data[1] + slope[1]*x) * z;
2794                 c[2] = (data[2] + slope[2]*x) * z;
2795                 c[3] = (data[3] + slope[3]*x) * z;
2796                 out4f[x*4+0] = c[0];
2797                 out4f[x*4+1] = c[1];
2798                 out4f[x*4+2] = c[2];
2799                 out4f[x*4+3] = c[3];
2800         }
2801 }
2802
2803 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2804 {
2805         int x, startx = span->startx, endx = span->endx;
2806         float c[4], localcolor[4];
2807         localcolor[0] = subcolor[0];
2808         localcolor[1] = subcolor[1];
2809         localcolor[2] = subcolor[2];
2810         localcolor[3] = subcolor[3];
2811         for (x = startx;x < endx;x++)
2812         {
2813                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2814                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2815                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2816                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2817                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2818                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2819                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2820                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2821         }
2822 }
2823
2824 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2825 {
2826         int x, startx = span->startx, endx = span->endx;
2827         for (x = startx;x < endx;x++)
2828         {
2829                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2830                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2831                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2832                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2833         }
2834 }
2835
2836 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2837 {
2838         int x, startx = span->startx, endx = span->endx;
2839         for (x = startx;x < endx;x++)
2840         {
2841                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2842                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2843                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2844                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2845         }
2846 }
2847
2848 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2849 {
2850         int x, startx = span->startx, endx = span->endx;
2851         float a, b;
2852         for (x = startx;x < endx;x++)
2853         {
2854                 a = 1.0f - inb4f[x*4+3];
2855                 b = inb4f[x*4+3];
2856                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2857                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2858                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2859                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2860         }
2861 }
2862
2863 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2864 {
2865         int x, startx = span->startx, endx = span->endx;
2866         float localcolor[4], ilerp, lerp;
2867         localcolor[0] = color[0];
2868         localcolor[1] = color[1];
2869         localcolor[2] = color[2];
2870         localcolor[3] = color[3];
2871         ilerp = 1.0f - localcolor[3];
2872         lerp = localcolor[3];
2873         for (x = startx;x < endx;x++)
2874         {
2875                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2876                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2877                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2878                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2879         }
2880 }
2881
2882
2883
2884 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2885 {
2886 #ifdef SSE_POSSIBLE
2887         int x;
2888         int startx = span->startx;
2889         int endx = span->endx;
2890         __m128 data, slope;
2891         __m128 mod, endmod;
2892         __m128i submod, substep, endsubmod;
2893         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2894         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2895         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2896         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2897         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2898         for (x = startx; x < endx;)
2899         {
2900                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2901                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2902                 if (nextsub >= endx)
2903                 {
2904                         nextsub = endsub = endx-1;
2905                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2906                 }
2907                 mod = endmod;
2908                 submod = endsubmod;
2909                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2910                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2911                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2912                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2913                 substep = _mm_packs_epi32(substep, substep);
2914                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2915                 {
2916                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2917                         pix = _mm_mulhi_epu16(pix, submod);
2918                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2919                 }
2920                 if (x <= endsub)
2921                 {
2922                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2923                         pix = _mm_mulhi_epu16(pix, submod);
2924                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2925                         x++;
2926                 }
2927         }
2928 #endif
2929 }
2930
2931 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2932 {
2933 #ifdef SSE_POSSIBLE
2934         int x;
2935         int startx = span->startx;
2936         int endx = span->endx;
2937         __m128 data, slope;
2938         __m128 mod, endmod;
2939         __m128i submod, substep, endsubmod;
2940         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2941         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2942         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2943         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2944         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2945         for (x = startx; x < endx;)
2946         {
2947                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2948                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2949                 if (nextsub >= endx)
2950                 {
2951                         nextsub = endsub = endx-1;
2952                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2953                 }
2954                 mod = endmod;
2955                 submod = endsubmod;
2956                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2957                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2958                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2959                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2960                 substep = _mm_packs_epi32(substep, substep);
2961                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2962                 {
2963                         __m128i pix = _mm_srai_epi16(submod, 4);
2964                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2965                 }
2966                 if (x <= endsub)
2967                 {
2968                         __m128i pix = _mm_srai_epi16(submod, 4);
2969                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2970                         x++;
2971                 }
2972         }
2973 #endif
2974 }
2975
2976 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2977 {
2978 #ifdef SSE_POSSIBLE
2979         int x, startx = span->startx, endx = span->endx;
2980         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2981         localcolor = _mm_packs_epi32(localcolor, localcolor);
2982         for (x = startx;x+2 <= endx;x+=2)
2983         {
2984                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2985                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2986                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
2987                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2988         }
2989         if (x < endx)
2990         {
2991                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2992                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2993                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
2994                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2995         }
2996 #endif
2997 }
2998
2999 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3000 {
3001 #ifdef SSE_POSSIBLE
3002         int x, startx = span->startx, endx = span->endx;
3003         for (x = startx;x+2 <= endx;x+=2)
3004         {
3005                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3006                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3007                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3008                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3009         }
3010         if (x < endx)
3011         {
3012                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3013                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3014                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3015                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3016         }
3017 #endif
3018 }
3019
3020 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3021 {
3022 #ifdef SSE_POSSIBLE
3023         int x, startx = span->startx, endx = span->endx;
3024         for (x = startx;x+2 <= endx;x+=2)
3025         {
3026                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3027                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3028                 pix1 = _mm_add_epi16(pix1, pix2);
3029                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3030         }
3031         if (x < endx)
3032         {
3033                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3034                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3035                 pix1 = _mm_add_epi16(pix1, pix2);
3036                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3037         }
3038 #endif
3039 }
3040
3041 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3042 {
3043 #ifdef SSE_POSSIBLE
3044         int x, startx = span->startx, endx = span->endx;
3045         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3046         tint = _mm_packs_epi32(tint, tint);
3047         for (x = startx;x+2 <= endx;x+=2)
3048         {
3049                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3050                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3051                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3052                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3053         }
3054         if (x < endx)
3055         {
3056                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3057                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3058                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3059                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3060         }
3061 #endif
3062 }
3063
3064 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3065 {
3066 #ifdef SSE_POSSIBLE
3067         int x, startx = span->startx, endx = span->endx;
3068         for (x = startx;x+2 <= endx;x+=2)
3069         {
3070                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3071                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3072                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3073                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3074                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3075         }
3076         if (x < endx)
3077         {
3078                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3079                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3080                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3081                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3082                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3083         }
3084 #endif
3085 }
3086
3087 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3088 {
3089 #ifdef SSE_POSSIBLE
3090         int x, startx = span->startx, endx = span->endx;
3091         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3092         localcolor = _mm_packs_epi32(localcolor, localcolor);
3093         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3094         for (x = startx;x+2 <= endx;x+=2)
3095         {
3096                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3097                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3098                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3099         }
3100         if (x < endx)
3101         {
3102                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3103                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3104                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3105         }
3106 #endif
3107 }
3108
3109
3110
3111 void DPSOFTRAST_VertexShader_Generic(void)
3112 {
3113         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3114         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3115         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3116         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3117                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3118 }
3119
3120 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3121 {
3122         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3123         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3124         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3125         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3126         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3127         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3128         {
3129                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3130                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3131                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3132                 {
3133                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3134                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3135                         {
3136                                 // multiply
3137                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3138                         }
3139                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3140                         {
3141                                 // add
3142                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3143                         }
3144                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3145                         {
3146                                 // alphablend
3147                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3148                         }
3149                 }
3150         }
3151         else
3152                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3153         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3154 }
3155
3156
3157
3158 void DPSOFTRAST_VertexShader_PostProcess(void)
3159 {
3160         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3161         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3162         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3163 }
3164
3165 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3166 {
3167         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3168         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3169         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3170         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3171         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3172         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3173         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3174         {
3175                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3176                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3177         }
3178         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3179         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3180         {
3181                 // TODO: implement saturation
3182         }
3183         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3184         {
3185                 // TODO: implement gammaramps
3186         }
3187         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3188 }
3189
3190
3191
3192 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3193 {
3194         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3195 }
3196
3197 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3198 {
3199         // this is never called (because colormask is off when this shader is used)
3200         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3201         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3202         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3203         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3204         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3205 }
3206
3207
3208
3209 void DPSOFTRAST_VertexShader_FlatColor(void)
3210 {
3211         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3212         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3213 }
3214
3215 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3216 {
3217 #ifdef SSE_POSSIBLE
3218         unsigned char * RESTRICT pixelmask = span->pixelmask;
3219         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3220         int x, startx = span->startx, endx = span->endx;
3221         __m128i Color_Ambientm;
3222         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3223         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3224         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3225         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3226         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3227         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3228                 pixel = buffer_FragColorbgra8;
3229         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3230         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3231         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3232         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3233         for (x = startx;x < endx;x++)
3234         {
3235                 __m128i color, pix;
3236                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3237                 {
3238                         __m128i pix2;
3239                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3240                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3241                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3242                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3243                         x += 3;
3244                         continue;
3245                 }
3246                 if (!pixelmask[x])
3247                         continue;
3248                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3249                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3250                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3251         }
3252         if (pixel == buffer_FragColorbgra8)
3253                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3254 #endif
3255 }
3256
3257
3258
3259 void DPSOFTRAST_VertexShader_VertexColor(void)
3260 {
3261         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3262         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3263         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3264 }
3265
3266 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3267 {
3268 #ifdef SSE_POSSIBLE
3269         unsigned char * RESTRICT pixelmask = span->pixelmask;
3270         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3271         int x, startx = span->startx, endx = span->endx;
3272         __m128i Color_Ambientm, Color_Diffusem;
3273         __m128 data, slope;
3274         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3275         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3276         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3277         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3278         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3279         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3280         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3281                 pixel = buffer_FragColorbgra8;
3282         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3283         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3284         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3285         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3286         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3287         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3288         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3289         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3290         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3291         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3292         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3293         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3294         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3295         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3296         {
3297                 __m128i color, mod, pix;
3298                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3299                 {
3300                         __m128i pix2, mod2;
3301                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3302                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3303                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3304                         data = _mm_add_ps(data, slope);
3305                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3306                         data = _mm_add_ps(data, slope);
3307                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3308                         data = _mm_add_ps(data, slope);
3309                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3310                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3311                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3312                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3313                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3314                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3315                         x += 3;
3316                         continue;
3317                 }
3318                 if (!pixelmask[x])
3319                         continue;
3320                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3321                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3322                 mod = _mm_packs_epi32(mod, mod);
3323                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3324                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3325         }
3326         if (pixel == buffer_FragColorbgra8)
3327                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3328 #endif
3329 }
3330
3331
3332
3333 void DPSOFTRAST_VertexShader_Lightmap(void)
3334 {
3335         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3336         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3337         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3338 }
3339
3340 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3341 {
3342 #ifdef SSE_POSSIBLE
3343         unsigned char * RESTRICT pixelmask = span->pixelmask;
3344         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3345         int x, startx = span->startx, endx = span->endx;
3346         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3347         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3348         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3349         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3350         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3351         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3352         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3353         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3354         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3355         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3356                 pixel = buffer_FragColorbgra8;
3357         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3358         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3359         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3360         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3361         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3362         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3363         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3364         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3365         {
3366                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3367                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3368                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3369                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3370                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3371                 for (x = startx;x < endx;x++)
3372                 {
3373                         __m128i color, lightmap, glow, pix;
3374                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3375                         {
3376                                 __m128i pix2;
3377                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3378                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3379                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3380                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3381                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3382                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3383                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3384                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3385                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3386                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3387                                 x += 3;
3388                                 continue;
3389                         }
3390                         if (!pixelmask[x])
3391                                 continue;
3392                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3393                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3394                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3395                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3396                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3397                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3398                 }
3399         }
3400         else
3401         {
3402                 for (x = startx;x < endx;x++)
3403                 {
3404                         __m128i color, lightmap, pix;
3405                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3406                         {
3407                                 __m128i pix2;
3408                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3409                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3410                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3411                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3412                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3413                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3414                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3415                                 x += 3;
3416                                 continue;
3417                         }
3418                         if (!pixelmask[x]) 
3419                                 continue;
3420                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3421                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3422                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3423                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3424                 }
3425         }
3426         if (pixel == buffer_FragColorbgra8)
3427                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3428 #endif
3429 }
3430
3431
3432 void DPSOFTRAST_VertexShader_LightDirection(void);
3433 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3434
// Vertex stage for the FakeLight shader mode.
// It has no work of its own: the whole job is delegated to the LightDirection vertex shader.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
        DPSOFTRAST_VertexShader_LightDirection();
}
3439
3440 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3441 {
3442         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3443 }
3444
3445
3446
3447 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3448 {
3449         DPSOFTRAST_VertexShader_LightDirection();
3450         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3451 }
3452
3453 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3454 {
3455         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3456 }
3457
3458
3459
3460 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3461 {
3462         DPSOFTRAST_VertexShader_LightDirection();
3463         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3464 }
3465
3466 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3467 {
3468         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3469 }
3470
3471
3472
3473 void DPSOFTRAST_VertexShader_LightDirection(void)
3474 {
3475         int i;
3476         int numvertices = dpsoftrast.numvertices;
3477         float LightDir[4];
3478         float LightVector[4];
3479         float EyePosition[4];
3480         float EyeVectorModelSpace[4];
3481         float EyeVector[4];
3482         float position[4];
3483         float svector[4];
3484         float tvector[4];
3485         float normal[4];
3486         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3487         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3488         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3489         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3490         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3491         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3492         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3493         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3494         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3495         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3496         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3497         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3498         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3499         for (i = 0;i < numvertices;i++)
3500         {
3501                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3502                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3503                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3504                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3505                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3506                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3507                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3508                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3509                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3510                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3511                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3512                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3513                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3514                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3515                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3516                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3517                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3518                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3519                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3520                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3521                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3522                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3523                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3524                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3525                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3526                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3527                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3528                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3529                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3530         }
3531         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3532 }
3533
3534 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3535 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3536 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3537 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3538 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3539 #define DPSOFTRAST_Vector3Normalize(v)\
3540 do\
3541 {\
3542         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3543         if (len)\
3544         {\
3545                 len = 1.0f / len;\
3546                 v[0] *= len;\
3547                 v[1] *= len;\
3548                 v[2] *= len;\
3549         }\
3550 }\
3551 while(0)
3552
3553 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3554 {
3555         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3556         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3557         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3558         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3559         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3560         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3561         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3562         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3563         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3564         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3565         int x, startx = span->startx, endx = span->endx;
3566         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3567         float LightVectordata[4];
3568         float LightVectorslope[4];
3569         float EyeVectordata[4];
3570         float EyeVectorslope[4];
3571         float VectorSdata[4];
3572         float VectorSslope[4];
3573         float VectorTdata[4];
3574         float VectorTslope[4];
3575         float VectorRdata[4];
3576         float VectorRslope[4];
3577         float z;
3578         float diffusetex[4];
3579         float glosstex[4];
3580         float surfacenormal[4];
3581         float lightnormal[4];
3582         float lightnormal_modelspace[4];
3583         float eyenormal[4];
3584         float specularnormal[4];
3585         float diffuse;
3586         float specular;
3587         float SpecularPower;
3588         int d[4];
3589         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3590         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3591         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3592         Color_Glow[3] = 0.0f;
3593         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3594         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3595         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3596         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3597         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3598         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3599         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3600         Color_Pants[3] = 0.0f;
3601         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3602         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3603         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3604         Color_Shirt[3] = 0.0f;
3605         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3606         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3607         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3608         {
3609                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3610                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3611         }
3612         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3613         {
3614                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3615         }
3616         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3617         {
3618                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3619                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3620                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3621                 Color_Diffuse[3] = 0.0f;
3622                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3623                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3624                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3625                 LightColor[3] = 0.0f;
3626                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3627                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3628                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3629                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3630                 Color_Specular[3] = 0.0f;
3631                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3632                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3633                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3634
3635                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3636                 {
3637                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3638                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3639                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3640                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3641                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3642                 }
3643                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3644                 {
3645                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3646                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3647                 }
3648                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3649                 {
3650                         // nothing of this needed
3651                 }
3652                 else
3653                 {
3654                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3655                 }
3656
3657                 for (x = startx;x < endx;x++)
3658                 {
3659                         z = buffer_z[x];
3660                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3661                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3662                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3663                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3664                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3665                         {
3666                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3667                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3668                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3669                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3670                         }
3671                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3672                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3673                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3674                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3675                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3676                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3677                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3678                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3679
3680                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3681                         {
3682                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3683                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3684                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3685                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3686
3687                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3688                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3689                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3690                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3691
3692                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3693                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3694                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3695                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3696
3697                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3698                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3699                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3700                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3701
3702                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3703                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3704
3705                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3706                                 {
3707                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3708                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3709                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3710                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3711                                 }
3712                         }
3713                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3714                         {
3715                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3716                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3717                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3718                                 {
3719                                         float f = 1.0f / 256.0f;
3720                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3721                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3722                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3723                                 }
3724                         }
3725                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3726                         {
3727                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3728                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3729                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3730                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3731
3732                                 LightColor[0] = 1.0;
3733                                 LightColor[1] = 1.0;
3734                                 LightColor[2] = 1.0;
3735                         }
3736                         else
3737                         {
3738                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3739                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3740                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3741                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3742                         }
3743
3744                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3745
3746                         if(thread->shader_exactspecularmath)
3747                         {
3748                                 // reflect lightnormal at surfacenormal, take the negative of that
3749                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3750                                 float f;
3751                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3752                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3753                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3754                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3755
3756                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3757                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3758                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3759                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3760                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3761
3762                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3763                         }
3764                         else
3765                         {
3766                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3767                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3768                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3769                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3770
3771                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3772                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3773                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3774                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3775
3776                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3777                         }
3778
3779                         specular = pow(specular, SpecularPower * glosstex[3]);
3780                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3781                         {
3782                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3783                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3784                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3785                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3786                         }
3787                         else
3788                         {
3789                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3790                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3791                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3792                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3793                         }
3794
3795                         buffer_FragColorbgra8[x*4+0] = d[0];
3796                         buffer_FragColorbgra8[x*4+1] = d[1];
3797                         buffer_FragColorbgra8[x*4+2] = d[2];
3798                         buffer_FragColorbgra8[x*4+3] = d[3];
3799                 }
3800         }
3801         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3802         {
3803                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3804                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3805                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3806                 Color_Diffuse[3] = 0.0f;
3807                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3808                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3809                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3810                 LightColor[3] = 0.0f;
3811                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3812
3813                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3814                 {
3815                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3816                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3817                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3818                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3819                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3820                 }
3821                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3822                 {
3823                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3824                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3825                 }
3826                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3827                 {
3828                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3829                 }
3830                 else
3831                 {
3832                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3833                 }
3834
3835                 for (x = startx;x < endx;x++)
3836                 {
3837                         z = buffer_z[x];
3838                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3839                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3840                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3841                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3842                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3843                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3844                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3845                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3846
3847                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3848                         {
3849                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3850                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3851                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3852                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3853
3854                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3855                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3856                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3857                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3858
3859                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3860                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3861                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3862                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3863
3864                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3865                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3866                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3867                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3868
3869                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3870                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3871
3872                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3873                                 {
3874                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3875                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3876                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3877                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3878                                 }
3879                         }
3880                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3881                         {
3882                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3883                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3884                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3885                                 {
3886                                         float f = 1.0f / 256.0f;
3887                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3888                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3889                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3890                                 }
3891                         }
3892                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3893                         {
3894                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3895                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3896                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3897                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3898
3899                                 LightColor[0] = 1.0;
3900                                 LightColor[1] = 1.0;
3901                                 LightColor[2] = 1.0;
3902                         }
3903                         else
3904                         {
3905                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3906                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3907                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3908                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3909                         }
3910
3911                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3912                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3913                         {
3914                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3915                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3916                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3917                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3918                         }
3919                         else
3920                         {
3921                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3922                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3923                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3924                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3925                         }
3926                         buffer_FragColorbgra8[x*4+0] = d[0];
3927                         buffer_FragColorbgra8[x*4+1] = d[1];
3928                         buffer_FragColorbgra8[x*4+2] = d[2];
3929                         buffer_FragColorbgra8[x*4+3] = d[3];
3930                 }
3931         }
3932         else
3933         {
3934                 for (x = startx;x < endx;x++)
3935                 {
3936                         z = buffer_z[x];
3937                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3938                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3939                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3940                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3941
3942                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3943                         {
3944                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3945                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3946                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3947                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3948                         }
3949                         else
3950                         {
3951                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3952                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3953                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3954                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3955                         }
3956                         buffer_FragColorbgra8[x*4+0] = d[0];
3957                         buffer_FragColorbgra8[x*4+1] = d[1];
3958                         buffer_FragColorbgra8[x*4+2] = d[2];
3959                         buffer_FragColorbgra8[x*4+3] = d[3];
3960                 }
3961         }
3962         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3963 }
3964
3965
3966
3967 void DPSOFTRAST_VertexShader_LightSource(void)
3968 {
3969         int i;
3970         int numvertices = dpsoftrast.numvertices;
3971         float LightPosition[4];
3972         float LightVector[4];
3973         float LightVectorModelSpace[4];
3974         float EyePosition[4];
3975         float EyeVectorModelSpace[4];
3976         float EyeVector[4];
3977         float position[4];
3978         float svector[4];
3979         float tvector[4];
3980         float normal[4];
3981         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3982         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3983         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3984         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3985         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3986         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3987         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3988         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3989         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3990         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3991         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3992         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3993         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3994         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3995         for (i = 0;i < numvertices;i++)
3996         {
3997                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3998                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3999                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4000                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4001                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4002                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4003                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4004                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4005                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4006                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4007                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4008                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4009                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4010                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4011                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4012                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4013                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4014                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4015                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4016                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4017                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4018                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4019                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4020                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4021                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4022                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4023                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4024                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4025                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4026                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4027                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4028                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4029         }
4030         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4031         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4032 }
4033
4034 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4035 {
4036 #ifdef SSE_POSSIBLE
4037         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4038         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4039         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4040         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4041         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4042         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4043         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4044         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4045         int x, startx = span->startx, endx = span->endx;
4046         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4047         float CubeVectordata[4];
4048         float CubeVectorslope[4];
4049         float LightVectordata[4];
4050         float LightVectorslope[4];
4051         float EyeVectordata[4];
4052         float EyeVectorslope[4];
4053         float z;
4054         float diffusetex[4];
4055         float glosstex[4];
4056         float surfacenormal[4];
4057         float lightnormal[4];
4058         float eyenormal[4];
4059         float specularnormal[4];
4060         float diffuse;
4061         float specular;
4062         float SpecularPower;
4063         float CubeVector[4];
4064         float attenuation;
4065         int d[4];
4066         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4067         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4068         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4069         Color_Glow[3] = 0.0f;
4070         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4071         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4072         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4073         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4074         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4075         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4076         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4077         Color_Diffuse[3] = 0.0f;
4078         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4079         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4080         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4081         Color_Specular[3] = 0.0f;
4082         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4083         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4084         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4085         Color_Pants[3] = 0.0f;
4086         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4087         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4088         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4089         Color_Shirt[3] = 0.0f;
4090         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4091         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4092         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4093         LightColor[3] = 0.0f;
4094         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4095         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4096         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4097         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4098         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4099         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4100         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4101         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4102         {
4103                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4104                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4105         }
4106         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4107                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4108         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4109         {
4110                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4111                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4112                 for (x = startx;x < endx;x++)
4113                 {
4114                         z = buffer_z[x];
4115                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4116                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4117                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4118                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4119                         if (attenuation < 0.01f)
4120                                 continue;
4121                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4122                         {
4123                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4124                                 if (attenuation < 0.01f)
4125                                         continue;
4126                         }
4127
4128                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4129                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4130                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4131                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4132                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4133                         {
4134                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4135                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4136                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4137                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4138                         }
4139                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4140                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4141                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4142                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4143                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4144                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4145                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4146                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4147
4148                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4149                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4150                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4151                         DPSOFTRAST_Vector3Normalize(lightnormal);
4152
4153                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4154
4155                         if(thread->shader_exactspecularmath)
4156                         {
4157                                 // reflect lightnormal at surfacenormal, take the negative of that
4158                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4159                                 float f;
4160                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4161                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4162                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4163                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4164
4165                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4166                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4167                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4168                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4169                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4170
4171                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4172                         }
4173                         else
4174                         {
4175                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4176                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4177                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4178                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4179
4180                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4181                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4182                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4183                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4184
4185                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4186                         }
4187                         specular = pow(specular, SpecularPower * glosstex[3]);
4188
4189                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4190                         {
4191                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4192                                 attenuation *= (1.0f / 255.0f);
4193                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4194                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4195                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4196                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4197                         }
4198                         else
4199                         {
4200                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4201                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4202                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4203                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4204                         }
4205                         buffer_FragColorbgra8[x*4+0] = d[0];
4206                         buffer_FragColorbgra8[x*4+1] = d[1];
4207                         buffer_FragColorbgra8[x*4+2] = d[2];
4208                         buffer_FragColorbgra8[x*4+3] = d[3];
4209                 }
4210         }
4211         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4212         {
4213                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4214                 for (x = startx;x < endx;x++)
4215                 {
4216                         z = buffer_z[x];
4217                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4218                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4219                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4220                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4221                         if (attenuation < 0.01f)
4222                                 continue;
4223                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4224                         {
4225                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4226                                 if (attenuation < 0.01f)
4227                                         continue;
4228                         }
4229
4230                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4231                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4232                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4233                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4234                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4235                         {
4236                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4237                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4238                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4239                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4240                         }
4241                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4242                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4243                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4244                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4245
4246                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4247                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4248                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4249                         DPSOFTRAST_Vector3Normalize(lightnormal);
4250
4251                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4252                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4253                         {
4254                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4255                                 attenuation *= (1.0f / 255.0f);
4256                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4257                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4258                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4259                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4260                         }
4261                         else
4262                         {
4263                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4264                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4265                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4266                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4267                         }
4268                         buffer_FragColorbgra8[x*4+0] = d[0];
4269                         buffer_FragColorbgra8[x*4+1] = d[1];
4270                         buffer_FragColorbgra8[x*4+2] = d[2];
4271                         buffer_FragColorbgra8[x*4+3] = d[3];
4272                 }
4273         }
4274         else
4275         {
4276                 for (x = startx;x < endx;x++)
4277                 {
4278                         z = buffer_z[x];
4279                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4280                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4281                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4282                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4283                         if (attenuation < 0.01f)
4284                                 continue;
4285                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4286                         {
4287                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4288                                 if (attenuation < 0.01f)
4289                                         continue;
4290                         }
4291
4292                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4293                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4294                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4295                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4296                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4297                         {
4298                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4299                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4300                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4301                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4302                         }
4303                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4304                         {
4305                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4306                                 attenuation *= (1.0f / 255.0f);
4307                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4308                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4309                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4310                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4311                         }
4312                         else
4313                         {
4314                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4315                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4316                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4317                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4318                         }
4319                         buffer_FragColorbgra8[x*4+0] = d[0];
4320                         buffer_FragColorbgra8[x*4+1] = d[1];
4321                         buffer_FragColorbgra8[x*4+2] = d[2];
4322                         buffer_FragColorbgra8[x*4+3] = d[3];
4323                 }
4324         }
4325         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4326 #endif
4327 }
4328
4329
4330
4331 void DPSOFTRAST_VertexShader_Refraction(void)
4332 {
4333         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4334         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4335         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4336 }
4337
4338 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4339 {
4340         // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4341
4342         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4343         float z;
4344         int x, startx = span->startx, endx = span->endx;
4345
4346         // texture reads
4347         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4348         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4349
4350         // varyings
4351         float ModelViewProjectionPositiondata[4];
4352         float ModelViewProjectionPositionslope[4];
4353
4354         // uniforms
4355         float ScreenScaleRefractReflect[2];
4356         float ScreenCenterRefractReflect[2];
4357         float DistortScaleRefractReflect[2];
4358         float RefractColor[4];
4359
4360         const unsigned char * RESTRICT pixelbase;
4361         const unsigned char * RESTRICT pixel[4];
4362         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4363         if(!texture) return;
4364         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4365
4366         // read textures
4367         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4368         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4369
4370         // read varyings
4371         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4372
4373         // read uniforms
4374         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4375         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4376         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4377         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4378         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4379         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4380         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4381         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4382         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4383         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4384
4385         // do stuff
4386         for (x = startx;x < endx;x++)
4387         {
4388                 float SafeScreenTexCoord[2];
4389                 float ScreenTexCoord[2];
4390                 float v[3];
4391                 float iw;
4392                 unsigned char c[4];
4393
4394                 z = buffer_z[x];
4395
4396                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4397                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4398                 
4399                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4400                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4401                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4402
4403                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4404                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4405                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4406                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4407                 DPSOFTRAST_Vector3Normalize(v);
4408                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4409                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4410
4411                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4412                 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4413                 {
4414                         unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<12) - 2048, ScreenTexCoord[1] * (texture->mipmap[0][3]<<12) - 2048};
4415                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4416                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4417                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4418                         int tci[2] = { tc[0]>>12, tc[1]>>12 };
4419                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4420                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4421                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4422                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4423                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4424                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4425                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4426                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4427                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4428                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4429                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4430                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4431                 }
4432                 else
4433                 {
4434                         int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2], ScreenTexCoord[1] * texture->mipmap[0][3] };
4435                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4436                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4437                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4438                         c[0] = pixel[0][0];
4439                         c[1] = pixel[0][1];
4440                         c[2] = pixel[0][2];
4441                 }
4442
4443                 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4444                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4445                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4446                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4447                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4448         }
4449
4450         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4451 }
4452
4453
4454
4455 void DPSOFTRAST_VertexShader_Water(void)
4456 {
4457         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4458 }
4459
4460
4461 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4462 {
4463         // TODO: IMPLEMENT
4464         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4465         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4466         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4467         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4468         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4469 }
4470
4471
4472
4473 void DPSOFTRAST_VertexShader_ShowDepth(void)
4474 {
4475         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4476 }
4477
4478 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4479 {
4480         // TODO: IMPLEMENT
4481         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4482         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4483         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4484         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4485         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4486 }
4487
4488
4489
4490 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4491 {
4492         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4493 }
4494
4495 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4496 {
4497         // TODO: IMPLEMENT
4498         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4499         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4500         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4501         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4502         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4503 }
4504
4505
4506
4507 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4508 {
4509         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4510 }
4511
4512 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4513 {
4514         // TODO: IMPLEMENT
4515         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4516         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4517         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4518         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4519         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4520 }
4521
4522
4523
4524 typedef struct DPSOFTRAST_ShaderModeInfo_s
4525 {
4526         int lodarrayindex;
4527         void (*Vertex)(void);
4528         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4529         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4530         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4531 }
4532 DPSOFTRAST_ShaderModeInfo;
4533
4534 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4535 {
4536         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4537         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4538         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4539         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4540         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4541         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4542         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4543         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4544         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4545         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4546         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4547         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4548         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4549         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4550         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4551         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4552 };
4553
4554 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4555 {
4556         int x;
4557         int startx;
4558         int endx;
4559         unsigned int *depthpixel;
4560         int depth;
4561         int depthslope;
4562         unsigned int d;
4563         unsigned char *pixelmask;
4564         DPSOFTRAST_State_Triangle *triangle;
4565         triangle = &thread->triangles[span->triangle];
4566         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4567         startx = span->startx;
4568         endx = span->endx;
4569         depth = span->depthbase;
4570         depthslope = span->depthslope;
4571         pixelmask = thread->pixelmaskarray;
4572         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4573         {
4574                 switch(thread->fb_depthfunc)
4575                 {
4576                 default:
4577                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4578                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4579                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4580                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4581                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4582                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4583                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4584                 }
4585                 while (startx < endx && !pixelmask[startx])
4586                         startx++;
4587                 while (endx > startx && !pixelmask[endx-1])
4588                         endx--;
4589         }
4590         else
4591         {
4592                 // no depth testing means we're just dealing with color...
4593                 memset(pixelmask + startx, 1, endx - startx);
4594         }
4595         span->pixelmask = pixelmask;
4596         span->startx = startx;
4597         span->endx = endx;
4598 }
4599
4600 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4601 {
4602         int x, d, depth, depthslope, startx, endx;
4603         const unsigned char *pixelmask;
4604         unsigned int *depthpixel;
4605         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4606         {
4607                 depth = span->depthbase;
4608                 depthslope = span->depthslope;
4609                 pixelmask = span->pixelmask;
4610                 startx = span->startx;
4611                 endx = span->endx;
4612                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4613                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4614                         if (pixelmask[x])
4615                                 depthpixel[x] = d;
4616         }
4617 }
4618
4619 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4620 {
4621         int i;
4622         DPSOFTRAST_State_Triangle *triangle;
4623         DPSOFTRAST_State_Span *span;
4624         for (i = 0; i < thread->numspans; i++)
4625         {
4626                 span = &thread->spans[i];
4627                 triangle = &thread->triangles[span->triangle];
4628                 DPSOFTRAST_Draw_DepthTest(thread, span);
4629                 if (span->startx >= span->endx)
4630                         continue;
4631                 // run pixel shader if appropriate
4632                 // do this before running depthmask code, to allow the pixelshader
4633                 // to clear pixelmask values for alpha testing
4634                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4635                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4636                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4637         }
4638         thread->numspans = 0;
4639 }
4640
4641 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4642
4643 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4644 {
4645 #ifdef SSE_POSSIBLE
4646         int cullface = thread->cullface;
4647         int minx, maxx, miny, maxy;
4648         int miny1, maxy1, miny2, maxy2;
4649         __m128i fbmin, fbmax;
4650         __m128 viewportcenter, viewportscale;
4651         int firstvertex = command->firstvertex;
4652         int numvertices = command->numvertices;
4653         int numtriangles = command->numtriangles;
4654         const int *element3i = command->element3i;
4655         const unsigned short *element3s = command->element3s;
4656         int clipped = command->clipped;
4657         int i;
4658         int j;
4659         int k;
4660         int y;
4661         int e[3];
4662         __m128i screeny;
4663         int starty, endy, bandy;
4664         int numpoints;
4665         int clipcase;
4666         float clipdist[4];
4667         float clip0origin, clip0slope;
4668         int clip0dir;
4669         __m128 triangleedge1, triangleedge2, trianglenormal;
4670         __m128 clipfrac[3];
4671         __m128 screen[4];
4672         DPSOFTRAST_State_Triangle *triangle;
4673         DPSOFTRAST_Texture *texture;
4674         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4675         miny = thread->fb_scissor[1];
4676         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4677         miny1 = bound(miny, thread->miny1, maxy);
4678         maxy1 = bound(miny, thread->maxy1, maxy);
4679         miny2 = bound(miny, thread->miny2, maxy);
4680         maxy2 = bound(miny, thread->maxy2, maxy);
4681         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4682         {
4683                 if (!ATOMIC_DECREMENT(command->refcount))
4684                 {
4685                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4686                                 MM_FREE(command->arrays);
4687                 }
4688                 return;
4689         }
4690         minx = thread->fb_scissor[0];
4691         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4692         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4693         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4694         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4695         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4696         screen[3] = _mm_setzero_ps();
4697         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4698         for (i = 0;i < numtriangles;i++)
4699         {
4700                 const float *screencoord4f = command->arrays;
4701                 const float *arrays = screencoord4f + numvertices*4;
4702
4703                 // generate the 3 edges of this triangle
4704                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4705                 if (element3s)
4706                 {
4707                         e[0] = element3s[i*3+0] - firstvertex;
4708                         e[1] = element3s[i*3+1] - firstvertex;
4709                         e[2] = element3s[i*3+2] - firstvertex;
4710                 }
4711                 else if (element3i)
4712                 {
4713                         e[0] = element3i[i*3+0] - firstvertex;
4714                         e[1] = element3i[i*3+1] - firstvertex;
4715                         e[2] = element3i[i*3+2] - firstvertex;
4716                 }
4717                 else
4718                 {
4719                         e[0] = i*3+0;
4720                         e[1] = i*3+1;
4721                         e[2] = i*3+2;
4722                 }
4723
4724 #define SKIPBACKFACE \
4725                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4726                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4727                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4728                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4729                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4730                 switch(cullface) \
4731                 { \
4732                 case GL_BACK: \
4733                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4734                                 continue; \
4735                         break; \
4736                 case GL_FRONT: \
4737                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4738                                 continue; \
4739                         break; \
4740                 }
4741
4742 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4743                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4744                         { \
4745                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4746                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4747                         }
4748 #define CLIPPEDVERTEXCOPY(k,p1) \
4749                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4750
4751 #define GENATTRIBCOPY(attrib, p1) \
4752                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4753 #define GENATTRIBLERP(attrib, p1, p2) \
4754                 { \
4755                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4756                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4757                 }
4758 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4759                 switch(clipcase) \
4760                 { \
4761                 default: \
4762                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4763                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4764                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4765                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4766                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4767                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4768                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4769                 }
4770
4771                 if (! clipped)
4772                         goto notclipped;
4773
4774                 // calculate distance from nearplane
4775                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4776                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4777                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4778                 if (clipdist[0] >= 0.0f)
4779                 {
4780                         if (clipdist[1] >= 0.0f)
4781                         {
4782                                 if (clipdist[2] >= 0.0f)
4783                                 {
4784                                 notclipped:
4785                                         // triangle is entirely in front of nearplane
4786                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4787                                         SKIPBACKFACE;
4788                                         numpoints = 3;
4789                                         clipcase = 0;
4790                                 }
4791                                 else
4792                                 {
4793                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4794                                         SKIPBACKFACE;
4795                                         numpoints = 4;
4796                                         clipcase = 1;
4797                                 }
4798                         }
4799                         else
4800                         {
4801                                 if (clipdist[2] >= 0.0f)
4802                                 {
4803                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4804                                         SKIPBACKFACE;
4805                                         numpoints = 4;
4806                                         clipcase = 2;
4807                                 }
4808                                 else
4809                                 {
4810                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4811                                         SKIPBACKFACE;
4812                                         numpoints = 3;
4813                                         clipcase = 3;
4814                                 }
4815                         }
4816                 }
4817                 else if (clipdist[1] >= 0.0f)
4818                 {
4819                         if (clipdist[2] >= 0.0f)
4820                         {
4821                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4822                                 SKIPBACKFACE;
4823                                 numpoints = 4;
4824                                 clipcase = 4;
4825                         }
4826                         else
4827                         {
4828                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4829                                 SKIPBACKFACE;
4830                                 numpoints = 3;
4831                                 clipcase = 5;
4832                         }
4833                 }
4834                 else if (clipdist[2] >= 0.0f)
4835                 {
4836                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4837                         SKIPBACKFACE;
4838                         numpoints = 3;
4839                         clipcase = 6;
4840                 }
4841                 else continue; // triangle is entirely behind nearplane
4842
4843                 {
4844                         // calculate integer y coords for triangle points
4845                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4846                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4847                                         screenmin = _mm_min_epi16(screeni, screenir),
4848                                         screenmax = _mm_max_epi16(screeni, screenir);
4849                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4850                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4851                         screenmin = _mm_max_epi16(screenmin, fbmin);
4852                         screenmax = _mm_min_epi16(screenmax, fbmax);
4853                         // skip offscreen triangles
4854                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4855                                 continue;
4856                         starty = _mm_extract_epi16(screenmin, 1);
4857                         endy = _mm_extract_epi16(screenmax, 1)+1;
4858                         if (starty >= maxy1 && endy <= miny2)
4859                                 continue;
4860                         screeny = _mm_srai_epi32(screeni, 16);
4861                 }
4862
4863                 triangle = &thread->triangles[thread->numtriangles];
4864
4865                 // calculate attribute plans for triangle data...
4866                 // okay, this triangle is going to produce spans, we'd better project
4867                 // the interpolants now (this is what gives perspective texturing),
4868                 // this consists of simply multiplying all arrays by the W coord
4869                 // (which is basically 1/Z), which will be undone per-pixel
4870                 // (multiplying by Z again) to get the perspective-correct array
4871                 // values
4872                 {
4873                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4874                         __m128 mipedgescale, mipdensity;
4875                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4876                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4877                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4878                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4879                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4880                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4881                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4882                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4883                         attribedge1 = _mm_sub_ss(w0, w1);
4884                         attribedge2 = _mm_sub_ss(w2, w1);
4885                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4886                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4887                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4888                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4889                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4890                         _mm_store_ss(&triangle->w[0], attribxslope);
4891                         _mm_store_ss(&triangle->w[1], attribyslope);
4892                         _mm_store_ss(&triangle->w[2], attriborigin);
4893                         
4894                         clip0origin = 0;
4895                         clip0slope = 0;
4896                         clip0dir = 0;
4897                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
4898                         {
4899                                 float cliporigin, clipxslope, clipyslope;
4900                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
4901                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4902                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
4903                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4904                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4905                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4906                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
4907                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
4908                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
4909                                 if(clipxslope != 0)
4910                                 {
4911                                         clip0origin = -cliporigin/clipxslope;
4912                                         clip0slope = -clipyslope/clipxslope;
4913                                         clip0dir = clipxslope > 0 ? 1 : -1;
4914                                 }
4915                                 else if(clipyslope > 0)
4916                                 {
4917                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
4918                                         clip0slope = dpsoftrast.fb_width;
4919                                         clip0dir = -1;
4920                                 }
4921                                 else if(clipyslope < 0)
4922                                 {
4923                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
4924                                         clip0slope = -dpsoftrast.fb_width;
4925                                         clip0dir = -1;
4926                                 }
4927                                 else if(clip0origin < 0) continue;
4928                         }
4929
4930                         mipedgescale = _mm_setzero_ps();
4931                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4932                         {
4933                                 __m128 attrib0, attrib1, attrib2;
4934                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4935                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4936                                         break;
4937                                 arrays += numvertices*4;
4938                                 GENATTRIBS(attrib0, attrib1, attrib2);
4939                                 attriborigin = _mm_mul_ps(attrib1, w1);
4940                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4941                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4942                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4943                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4944                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4945                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4946                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4947                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
4948                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4949                                 {
4950                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4951                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4952                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4953                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4954                                 }
4955                         }
4956
4957                         memset(triangle->mip, 0, sizeof(triangle->mip));
4958                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4959                         {
4960                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4961                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4962                                         break;
4963                                 texture = thread->texbound[texunit];
4964                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4965                                 {
4966                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4967                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4968                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4969                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4970                                         // this will be multiplied in the texturing routine by the texture resolution
4971                                         y = _mm_cvtss_si32(mipdensity);
4972                                         if (y > 0)
4973                                         {
4974                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4975                                                 if (y > texture->mipmaps - 1)
4976                                                         y = texture->mipmaps - 1;
4977                                                 triangle->mip[texunit] = y;
4978                                         }
4979                                 }
4980                         }
4981                 }
4982         
4983                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4984                 for (; y < bandy;)
4985                 {
4986                         __m128 xcoords, xslope;
4987                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4988                         int yccmask = _mm_movemask_epi8(ycc);
4989                         int edge0p, edge0n, edge1p, edge1n;
4990                         int nexty;
4991                         float w, wslope;
4992                         float clip0;
4993                         if (numpoints == 4)
4994                         {
4995                                 switch(yccmask)
4996                                 {
4997                                 default:
4998                                 case 0xFFFF: /*0000*/ y = endy; continue;
4999                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5000                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5001                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5002                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5003                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5004                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5005                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5006                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5007                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5008                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5009                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5010                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5011                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5012                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5013                                 case 0x0000: /*1111*/ y++; continue;
5014                                 }
5015                         }
5016                         else
5017                         {
5018                                 switch(yccmask)
5019                                 {
5020                                 default:
5021                                 case 0xFFFF: /*000*/ y = endy; continue;
5022                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5023                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5024                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5025                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5026                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5027                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5028                                 case 0x0000: /*111*/ y++; continue;
5029                                 }
5030                         }
5031                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5032                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5033                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5034                         nexty = _mm_extract_epi16(ycc, 0);
5035                         if (nexty >= bandy) nexty = bandy-1;
5036                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5037                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5038                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5039                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5040                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5041                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5042                         {
5043                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5044                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5045                         }
5046                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5047                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5048                         {
5049                                 int startx, endx, offset;
5050                                 startx = _mm_cvtss_si32(xcoords);
5051                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5052                                 if (startx < minx) startx = minx;
5053                                 if (endx > maxx) endx = maxx;
5054                                 if (startx >= endx) continue;
5055
5056                                 if (clip0dir)
5057                                 {
5058                                         if (clip0dir > 0)
5059                                         {
5060                                                 if (startx < clip0) 
5061                                                 {
5062                                                         if(endx <= clip0) continue;
5063                                                         startx = (int)clip0;
5064                                                 }
5065                                         }
5066                                         else if (endx > clip0) 
5067                                         {
5068                                                 if(startx >= clip0) continue;
5069                                                 endx = (int)clip0;
5070                                         }
5071                                 }
5072                                                 
5073                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5074                                 {
5075                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5076                                         span->triangle = thread->numtriangles;
5077                                         span->x = offset;
5078                                         span->y = y;
5079                                         span->startx = 0;
5080                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5081                                         if (span->startx >= span->endx)
5082                                                 continue;
5083                                         wslope = triangle->w[0];
5084                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5085                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5086                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5087                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5088                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5089                                 }
5090                         }
5091                 }
5092
5093                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5094                 {
5095                         DPSOFTRAST_Draw_ProcessSpans(thread);
5096                         thread->numtriangles = 0;
5097                 }
5098         }
5099
5100         if (!ATOMIC_DECREMENT(command->refcount))
5101         {
5102                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5103                         MM_FREE(command->arrays);
5104         }
5105
5106         if (thread->numspans > 0 || thread->numtriangles > 0)
5107         {
5108                 DPSOFTRAST_Draw_ProcessSpans(thread);
5109                 thread->numtriangles = 0;
5110         }
5111 #endif
5112 }
5113
5114 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5115 {
5116         int i;
5117         int j;
5118         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5119         int datasize = 2*numvertices*sizeof(float[4]);
5120         DPSOFTRAST_Command_Draw *command;
5121         unsigned char *data;
5122         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5123         {
5124                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5125                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5126                         break;
5127                 datasize += numvertices*sizeof(float[4]);
5128         }
5129         if (element3s)
5130                 datasize += numtriangles*sizeof(unsigned short[3]);
5131         else if (element3i)
5132                 datasize += numtriangles*sizeof(int[3]);
5133         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5134         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5135         {
5136                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5137                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5138         }
5139         else
5140         {
5141                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5142                 data = (unsigned char *)command + commandsize;
5143         }
5144         command->firstvertex = firstvertex;
5145         command->numvertices = numvertices;
5146         command->numtriangles = numtriangles;
5147         command->arrays = (float *)data;
5148         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5149         dpsoftrast.firstvertex = firstvertex;
5150         dpsoftrast.numvertices = numvertices;
5151         dpsoftrast.screencoord4f = (float *)data;
5152         data += numvertices*sizeof(float[4]);
5153         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5154         data += numvertices*sizeof(float[4]);
5155         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5156         {
5157                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5158                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5159                         break;
5160                 dpsoftrast.post_array4f[j] = (float *)data;
5161                 data += numvertices*sizeof(float[4]);
5162         }
5163         command->element3i = NULL;
5164         command->element3s = NULL;
5165         if (element3s)
5166         {
5167                 command->element3s = (unsigned short *)data;
5168                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5169         }
5170         else if (element3i)
5171         {
5172                 command->element3i = (int *)data;
5173                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5174         }
5175         return command;
5176 }
5177
5178 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5179 {
5180         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5181         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5182         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5183         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5184         if (command->starty >= command->endy)
5185         {
5186                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5187                         MM_FREE(command->arrays);
5188                 DPSOFTRAST_UndoCommand(command->commandsize);
5189                 return;
5190         }
5191         command->clipped = dpsoftrast.drawclipped;
5192         command->refcount = dpsoftrast.numthreads;
5193
5194         if (dpsoftrast.usethreads)
5195         {
5196                 int i;
5197                 DPSOFTRAST_Draw_SyncCommands();
5198                 for (i = 0; i < dpsoftrast.numthreads; i++)
5199                 {
5200                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5201                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5202                                 Thread_CondSignal(thread->drawcond);
5203                 }
5204         }
5205         else
5206         {
5207                 DPSOFTRAST_Draw_FlushThreads();
5208         }
5209 }
5210
5211 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5212 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5213 {
5214         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5215 }
5216 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5217 {
5218         DPSOFTRAST_Command_SetRenderTargets *command;
5219         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5220                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5221                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5222                 DPSOFTRAST_Flush();
5223         dpsoftrast.fb_width = width;
5224         dpsoftrast.fb_height = height;
5225         dpsoftrast.fb_depthpixels = depthpixels;
5226         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5227         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5228         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5229         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5230         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5231         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5232         command->width = width;
5233         command->height = height;
5234 }
5235  
5236 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5237 {
5238         int commandoffset = thread->commandoffset;
5239         while (commandoffset != endoffset)
5240         {
5241                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5242                 switch (command->opcode)
5243                 {
5244 #define INTERPCOMMAND(name) \
5245                 case DPSOFTRAST_OPCODE_##name : \
5246                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5247                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5248                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5249                                 commandoffset = 0; \
5250                         break;
5251                 INTERPCOMMAND(Viewport)
5252                 INTERPCOMMAND(ClearColor)
5253                 INTERPCOMMAND(ClearDepth)
5254                 INTERPCOMMAND(ColorMask)
5255                 INTERPCOMMAND(DepthTest)
5256                 INTERPCOMMAND(ScissorTest)
5257                 INTERPCOMMAND(Scissor)
5258                 INTERPCOMMAND(BlendFunc)
5259                 INTERPCOMMAND(BlendSubtract)
5260                 INTERPCOMMAND(DepthMask)
5261                 INTERPCOMMAND(DepthFunc)
5262                 INTERPCOMMAND(DepthRange)
5263                 INTERPCOMMAND(PolygonOffset)
5264                 INTERPCOMMAND(CullFace)
5265                 INTERPCOMMAND(AlphaTest)
5266                 INTERPCOMMAND(AlphaFunc)
5267                 INTERPCOMMAND(SetTexture)
5268                 INTERPCOMMAND(SetShader)
5269                 INTERPCOMMAND(Uniform4f)
5270                 INTERPCOMMAND(UniformMatrix4f)
5271                 INTERPCOMMAND(Uniform1i)
5272                 INTERPCOMMAND(SetRenderTargets)
5273                 INTERPCOMMAND(ClipPlane)
5274
5275                 case DPSOFTRAST_OPCODE_Draw:
5276                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5277                         commandoffset += command->commandsize;
5278                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5279                                 commandoffset = 0;
5280                         thread->commandoffset = commandoffset;
5281                         break;
5282
5283                 case DPSOFTRAST_OPCODE_Reset:
5284                         commandoffset = 0;
5285                         break;
5286                 }
5287         }
5288         thread->commandoffset = commandoffset;
5289 }
5290
5291 static int DPSOFTRAST_Draw_Thread(void *data)
5292 {
5293         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5294         while(thread->index >= 0)
5295         {
5296                 if (thread->commandoffset != dpsoftrast.drawcommand)
5297                 {
5298                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5299                 }
5300                 else 
5301                 {
5302                         Thread_LockMutex(thread->drawmutex);
5303                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5304                         {
5305                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5306                                 thread->starving = true;
5307                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5308                                 thread->starving = false;
5309                         }
5310                         Thread_UnlockMutex(thread->drawmutex);
5311                 }
5312         }   
5313         return 0;
5314 }
5315
5316 static void DPSOFTRAST_Draw_FlushThreads(void)
5317 {
5318         DPSOFTRAST_State_Thread *thread;
5319         int i;
5320         DPSOFTRAST_Draw_SyncCommands();
5321         if (dpsoftrast.usethreads) 
5322         {
5323                 for (i = 0; i < dpsoftrast.numthreads; i++)
5324                 {
5325                         thread = &dpsoftrast.threads[i];
5326                         if (thread->commandoffset != dpsoftrast.drawcommand)
5327                         {
5328                                 Thread_LockMutex(thread->drawmutex);
5329                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5330                                         Thread_CondSignal(thread->drawcond);
5331                                 Thread_UnlockMutex(thread->drawmutex);
5332                         }
5333                 }
5334                 for (i = 0; i < dpsoftrast.numthreads; i++)
5335                 {
5336                         thread = &dpsoftrast.threads[i];
5337                         if (thread->commandoffset != dpsoftrast.drawcommand)
5338                         {
5339                                 Thread_LockMutex(thread->drawmutex);
5340                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5341                                 {
5342                                         thread->waiting = true;
5343                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5344                                         thread->waiting = false;
5345                                 }
5346                                 Thread_UnlockMutex(thread->drawmutex);
5347                         }
5348                 }
5349         }
5350         else
5351         {
5352                 for (i = 0; i < dpsoftrast.numthreads; i++)
5353                 {
5354                         thread = &dpsoftrast.threads[i];
5355                         if (thread->commandoffset != dpsoftrast.drawcommand)
5356                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5357                 }
5358         }
5359         dpsoftrast.commandpool.usedcommands = 0;
5360 }
5361
/*
 * Public flush entry point: executes all commands queued so far by
 * delegating to DPSOFTRAST_Draw_FlushThreads().
 */
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5366
/*
 * Public finish entry point.  Currently this is just DPSOFTRAST_Flush().
 */
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5371
5372 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5373 {
5374         int i;
5375         union
5376         {
5377                 int i;
5378                 unsigned char b[4];
5379         }
5380         u;
5381         u.i = 1;
5382         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5383         dpsoftrast.bigendian = u.b[3];
5384         dpsoftrast.fb_width = width;
5385         dpsoftrast.fb_height = height;
5386         dpsoftrast.fb_depthpixels = depthpixels;
5387         dpsoftrast.fb_colorpixels[0] = colorpixels;
5388         dpsoftrast.fb_colorpixels[1] = NULL;
5389         dpsoftrast.fb_colorpixels[1] = NULL;
5390         dpsoftrast.fb_colorpixels[1] = NULL;
5391         dpsoftrast.viewport[0] = 0;
5392         dpsoftrast.viewport[1] = 0;
5393         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5394         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5395         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5396         dpsoftrast.texture_firstfree = 1;
5397         dpsoftrast.texture_end = 1;
5398         dpsoftrast.texture_max = 0;
5399         dpsoftrast.color[0] = 1;
5400         dpsoftrast.color[1] = 1;
5401         dpsoftrast.color[2] = 1;
5402         dpsoftrast.color[3] = 1;
5403         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5404         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5405         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5406         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5407         for (i = 0; i < dpsoftrast.numthreads; i++)
5408         {
5409                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5410                 thread->index = i;
5411                 thread->cullface = GL_BACK;
5412         thread->colormask[0] = 1; 
5413                 thread->colormask[1] = 1;
5414                 thread->colormask[2] = 1;
5415                 thread->colormask[3] = 1;
5416                 thread->blendfunc[0] = GL_ONE;
5417                 thread->blendfunc[1] = GL_ZERO;
5418                 thread->depthmask = true;
5419                 thread->depthtest = true;
5420                 thread->depthfunc = GL_LEQUAL;
5421                 thread->scissortest = false;
5422                 thread->alphatest = false;
5423                 thread->alphafunc = GL_GREATER;
5424                 thread->alphavalue = 0.5f;
5425                 thread->viewport[0] = 0;
5426                 thread->viewport[1] = 0;
5427                 thread->viewport[2] = dpsoftrast.fb_width;
5428                 thread->viewport[3] = dpsoftrast.fb_height;
5429                 thread->scissor[0] = 0;
5430                 thread->scissor[1] = 0;
5431                 thread->scissor[2] = dpsoftrast.fb_width;
5432                 thread->scissor[3] = dpsoftrast.fb_height;
5433                 thread->depthrange[0] = 0;
5434                 thread->depthrange[1] = 1;
5435                 thread->polygonoffset[0] = 0;
5436                 thread->polygonoffset[1] = 0;
5437                 thread->clipplane[0] = 0;
5438                 thread->clipplane[1] = 0;
5439                 thread->clipplane[2] = 0;
5440                 thread->clipplane[3] = 1;
5441         
5442                 thread->numspans = 0;
5443                 thread->numtriangles = 0;
5444                 thread->commandoffset = 0;
5445                 thread->waiting = false;
5446                 thread->starving = false;
5447            
5448                 thread->validate = -1;
5449                 DPSOFTRAST_Validate(thread, -1);
5450  
5451                 if (dpsoftrast.usethreads)
5452                 {
5453                         thread->waitcond = Thread_CreateCond();
5454                         thread->drawcond = Thread_CreateCond();
5455                         thread->drawmutex = Thread_CreateMutex();
5456                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5457                 }
5458         }
5459         return 0;
5460 }
5461
5462 void DPSOFTRAST_Shutdown(void)
5463 {
5464         int i;
5465         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5466         {
5467                 DPSOFTRAST_State_Thread *thread;
5468                 for (i = 0; i < dpsoftrast.numthreads; i++)
5469                 {
5470                         thread = &dpsoftrast.threads[i];
5471                         Thread_LockMutex(thread->drawmutex);
5472                         thread->index = -1;
5473                         Thread_CondSignal(thread->drawcond);
5474                         Thread_UnlockMutex(thread->drawmutex);
5475                         Thread_WaitThread(thread->thread, 0);
5476                         Thread_DestroyCond(thread->waitcond);
5477                         Thread_DestroyCond(thread->drawcond);
5478                         Thread_DestroyMutex(thread->drawmutex);
5479                 }
5480         }
5481         for (i = 0;i < dpsoftrast.texture_end;i++)
5482                 if (dpsoftrast.texture[i].bytes)
5483                         MM_FREE(dpsoftrast.texture[i].bytes);
5484         if (dpsoftrast.texture)
5485                 free(dpsoftrast.texture);
5486         if (dpsoftrast.threads)
5487                 MM_FREE(dpsoftrast.threads);
5488         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5489 }
5490