de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
fix Vertex_BoundY params
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifndef __cplusplus
10 typedef qboolean bool;
11 #endif
12
// byte alignments requested by the ALIGN() and ATOMIC() wrappers below
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32

// compiler-specific alignment attributes, store fence and interlocked counter
// operations; only chosen when SSE2 support is compiled in
#if defined(SSE2_PRESENT)
#	if defined(__APPLE__)
#		include <libkern/OSAtomic.h>
#		define ALIGN(var) var __attribute__((__aligned__(16)))
#		define ATOMIC(var) var __attribute__((__aligned__(32)))
#		define MEMORY_BARRIER (_mm_sfence())
#		define ATOMIC_COUNTER volatile int32_t
#		define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
#		define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
#		define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
#	elif defined(__GNUC__)
#		define ALIGN(var) var __attribute__((__aligned__(16)))
#		define ATOMIC(var) var __attribute__((__aligned__(32)))
#		define MEMORY_BARRIER (_mm_sfence())
		//(__sync_synchronize())
#		define ATOMIC_COUNTER volatile int
#		define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
#		define ATOMIC_DECREMENT(counter) (__sync_sub_and_fetch(&(counter), 1))
#		define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
#	elif defined(_MSC_VER)
#		define ALIGN(var) __declspec(align(16)) var
#		define ATOMIC(var) __declspec(align(32)) var
#		define MEMORY_BARRIER (_mm_sfence())
		//(MemoryBarrier())
#		define ATOMIC_COUNTER volatile LONG
#		define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
#		define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
#		define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
#	endif
#endif
46
// portable fallbacks for every wrapper that has not been defined yet:
// no alignment attribute, no fence, and ordinary (non-interlocked) arithmetic
#if !defined(ALIGN)
#define ALIGN(var) var
#endif
#if !defined(ATOMIC)
#define ATOMIC(var) var
#endif
#if !defined(MEMORY_BARRIER)
#define MEMORY_BARRIER ((void)0)
#endif
#if !defined(ATOMIC_COUNTER)
#define ATOMIC_COUNTER int
#endif
// both yield the counter's new value, like the interlocked versions
#if !defined(ATOMIC_INCREMENT)
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#if !defined(ATOMIC_DECREMENT)
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#if !defined(ATOMIC_ADD)
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
68
// memory helpers: with SSE2 every allocation is aligned to ATOMIC_SIZE bytes
// via _mm_malloc (and must be released with MM_FREE), otherwise the plain C
// allocator is used
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// aligned counterpart of calloc: returns nmemb*size zeroed bytes, or NULL if
// the allocation fails or the byte count does not fit in size_t
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// calloc rejects a wrapping product, so this replacement has to as well;
	// otherwise a too-small buffer would be handed out
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
87
// indices of the vertex attribute arrays (position, color and eight texcoord
// sets); DPSOFTRAST_ARRAY_TOTAL is the number of arrays
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION = 0,
	DPSOFTRAST_ARRAY_COLOR = 1,
	DPSOFTRAST_ARRAY_TEXCOORD0 = 2,
	DPSOFTRAST_ARRAY_TEXCOORD1 = 3,
	DPSOFTRAST_ARRAY_TEXCOORD2 = 4,
	DPSOFTRAST_ARRAY_TEXCOORD3 = 5,
	DPSOFTRAST_ARRAY_TEXCOORD4 = 6,
	DPSOFTRAST_ARRAY_TEXCOORD5 = 7,
	DPSOFTRAST_ARRAY_TEXCOORD6 = 8,
	DPSOFTRAST_ARRAY_TEXCOORD7 = 9,
	DPSOFTRAST_ARRAY_TOTAL = 10
}
DPSOFTRAST_ARRAY;
103
104 typedef struct DPSOFTRAST_Texture_s
105 {
106         int flags;
107         int width;
108         int height;
109         int depth;
110         int sides;
111         DPSOFTRAST_TEXTURE_FILTER filter;
112         int mipmaps;
113         int size;
114         ATOMIC_COUNTER binds;
115         unsigned char *bytes;
116         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
117 }
118 DPSOFTRAST_Texture;
119
// command records use the same alignment as ALIGN()
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// header that every command record starts with
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode;
	unsigned short commandsize;
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// declares the opcode constant DPSOFTRAST_OPCODE_<name> and the record type
// DPSOFTRAST_Command_<name>, which is the common header followed by "fields"
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
140
141 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
142 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
143
144 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
145 {
146         int freecommand;
147         int usedcommands;
148         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
149 }
150 DPSOFTRAST_State_Command_Pool);
151
152 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
153 {
154         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
155         float w[3];
156         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
157 }
158 DPSOFTRAST_State_Triangle);
159
// Evaluates one attribute array of a triangle at the start of a span:
//   slope = attribs[arrayindex][0]                       (change per x step)
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// SSE version: data and slope are __m128 lvalues.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar version of the same calculation: data and slope are float[4].
// Every pointer/array argument is parenthesized before use so that
// expressions such as &span or buffer+4 expand correctly.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
176                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// one horizontal run of pixels produced from a triangle
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	// triangle this span was generated by
	int triangle;
	// framebuffer x coord
	int x;
	// framebuffer y coord
	int y;
	// usable range (according to pixelmask)
	int startx;
	int endx;
	// true for pixels that passed depth test, false for others
	unsigned char *pixelmask;
}
DPSOFTRAST_State_Span);
189
// capacity of the per-thread span and triangle arrays
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// bits for the per-thread "validate" field; a set bit means the matching
// derived values are recalculated by DPSOFTRAST_Validate
#define DPSOFTRAST_VALIDATE_FB (1<<0)
#define DPSOFTRAST_VALIDATE_DEPTHFUNC (1<<1)
#define DPSOFTRAST_VALIDATE_BLENDFUNC (1<<2)
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
197
// blend modes that DPSOFTRAST_RecalcBlendFunc selects from the blend factors;
// DPSOFTRAST_BLENDMODE_TOTAL is the number of modes
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE = 0,
	DPSOFTRAST_BLENDMODE_ALPHA = 1,
	DPSOFTRAST_BLENDMODE_ADDALPHA = 2,
	DPSOFTRAST_BLENDMODE_ADD = 3,
	DPSOFTRAST_BLENDMODE_INVMOD = 4,
	DPSOFTRAST_BLENDMODE_MUL = 5,
	DPSOFTRAST_BLENDMODE_MUL2 = 6,
	DPSOFTRAST_BLENDMODE_SUBALPHA = 7,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA = 8,
	DPSOFTRAST_BLENDMODE_INVADD = 9,
	DPSOFTRAST_BLENDMODE_TOTAL = 10
}
DPSOFTRAST_BLENDMODE;
213
214 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
215 {
216         void *thread;
217         int index;
218         
219         int cullface;
220         int colormask[4];
221         int blendfunc[2];
222         int blendsubtract;
223         int depthmask;
224         int depthtest;
225         int depthfunc;
226         int scissortest;
227         int alphatest;
228         int alphafunc;
229         float alphavalue;
230         int viewport[4];
231         int scissor[4];
232         float depthrange[2];
233         float polygonoffset[2];
234
235         int shader_mode;
236         int shader_permutation;
237         int shader_exactspecularmath;
238
239         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
240         
241         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
242         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
243
244         // DPSOFTRAST_VALIDATE_ flags
245         int validate;
246
247         // derived values (DPSOFTRAST_VALIDATE_FB)
248         int fb_colormask;
249         int fb_scissor[4];
250         ALIGN(float fb_viewportcenter[4]);
251         ALIGN(float fb_viewportscale[4]);
252
253         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
254         int fb_depthfunc;
255
256         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
257         int fb_blendmode;
258
259         // band boundaries
260         int miny1;
261         int maxy1;
262         int miny2;
263         int maxy2;
264
265         ATOMIC(volatile int commandoffset);
266
267         volatile bool waiting;
268         volatile bool starving;
269         void *waitcond;
270         void *drawcond;
271         void *drawmutex;
272
273         int numspans;
274         int numtriangles;
275         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
276         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
277 }
278 DPSOFTRAST_State_Thread);
279
280 typedef ATOMIC(struct DPSOFTRAST_State_s
281 {
282         int fb_width;
283         int fb_height;
284         unsigned int *fb_depthpixels;
285         unsigned int *fb_colorpixels[4];
286
287         int viewport[4];
288         ALIGN(float fb_viewportcenter[4]);
289         ALIGN(float fb_viewportscale[4]);
290
291         float color[4];
292         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
293         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
294
295         const float *pointer_vertex3f;
296         const float *pointer_color4f;
297         const unsigned char *pointer_color4ub;
298         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
299         int stride_vertex;
300         int stride_color;
301         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
302         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
303         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
304
305         int firstvertex;
306         int numvertices;
307         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
308         float *screencoord4f;
309         int drawstarty;
310         int drawendy;
311         int drawclipped;
312         
313         int shader_mode;
314         int shader_permutation;
315         int shader_exactspecularmath;
316
317         int texture_max;
318         int texture_end;
319         int texture_firstfree;
320         DPSOFTRAST_Texture *texture;
321
322         int bigendian;
323
324         // error reporting
325         const char *errorstring;
326
327         bool usethreads;
328         int interlace;
329         int numthreads;
330         DPSOFTRAST_State_Thread *threads;
331
332         ATOMIC(volatile int drawcommand);
333
334         DPSOFTRAST_State_Command_Pool commandpool;
335 }
336 DPSOFTRAST_State);
337
338 DPSOFTRAST_State dpsoftrast;
339
// depth buffer values are (1-d) scaled by DPSOFTRAST_DEPTHSCALE (2^30)
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// packs four floats in the range 0..1 into one 0xAARRGGBB integer, rounding to
// nearest; the arguments are parenthesized so that expressions such as
// "base + glow" are scaled as a whole
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// converts a float depth to the integer depth buffer format; (d) is
// parenthesized so that "1-(near - far)" is not expanded as "1-near - far"
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
345
346 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
347 {
348         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
349         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
350         fb_viewportcenter[3] = 0.5f;
351         fb_viewportcenter[0] = 0.0f;
352         fb_viewportscale[1] = 0.5f * viewport[2];
353         fb_viewportscale[2] = -0.5f * viewport[3];
354         fb_viewportscale[3] = 0.5f;
355         fb_viewportscale[0] = 1.0f;
356 }
357
358 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
359 {
360         if (dpsoftrast.interlace)
361         {
362                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
363                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
364                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
365                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
366         }
367         else
368         {
369                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
370                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
371         }
372 }
373
374 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
375 {
376         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
377         // and viewport projection values
378         int x1, x2;
379         int y1, y2;
380         x1 = thread->scissor[0];
381         x2 = thread->scissor[0] + thread->scissor[2];
382         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
383         y2 = dpsoftrast.fb_height - thread->scissor[1];
384         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
385         if (x1 < 0) x1 = 0;
386         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
387         if (y1 < 0) y1 = 0;
388         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
389         thread->fb_scissor[0] = x1;
390         thread->fb_scissor[1] = y1;
391         thread->fb_scissor[2] = x2 - x1;
392         thread->fb_scissor[3] = y2 - y1;
393
394         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
395         DPSOFTRAST_RecalcThread(thread);
396 }
397
398 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
399 {
400         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
401 }
402
403 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
404 {
405         if (thread->blendsubtract)
406         {
407                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
408                 {
409                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
410                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
411                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
412                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
413                 }
414         }
415         else
416         {       
417                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
418                 {
419                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
420                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
421                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
422                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
423                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
424                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
425                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
426                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
427                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
428                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
429                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
430                 }
431         }
432 }
433
// Calls DPSOFTRAST_Validate only when at least one of the flags in f is still
// set in thread->validate; always evaluates to 0. The thread argument is
// parenthesized so that any pointer expression can be passed; note that it is
// evaluated more than once.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
435
436 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
437 {
438         mask &= thread->validate;
439         if (!mask)
440                 return;
441         if (mask & DPSOFTRAST_VALIDATE_FB)
442         {
443                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
444                 DPSOFTRAST_RecalcFB(thread);
445         }
446         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
447         {
448                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
449                 DPSOFTRAST_RecalcDepthFunc(thread);
450         }
451         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
452         {
453                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
454                 DPSOFTRAST_RecalcBlendFunc(thread);
455         }
456 }
457
458 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
459 {
460         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
461                 return &dpsoftrast.texture[index];
462         return NULL;
463 }
464
465 static void DPSOFTRAST_Texture_Grow(void)
466 {
467         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
468         DPSOFTRAST_State_Thread *thread;
469         int i;
470         int j;
471         DPSOFTRAST_Flush();
472         // expand texture array as needed
473         if (dpsoftrast.texture_max < 1024)
474                 dpsoftrast.texture_max = 1024;
475         else
476                 dpsoftrast.texture_max *= 2;
477         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
478         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
479                 if (dpsoftrast.texbound[i])
480                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
481         for (j = 0; j < dpsoftrast.numthreads; j++)
482         {
483                 thread = &dpsoftrast.threads[j];
484                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
485                         if (thread->texbound[i])
486                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
487         }
488 }
489
490 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
491 {
492         int w;
493         int h;
494         int d;
495         int size;
496         int s;
497         int texnum;
498         int mipmaps;
499         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
500         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
501         DPSOFTRAST_Texture *texture;
502         if (width*height*depth < 1)
503         {
504                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
505                 return 0;
506         }
507         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
508         {
509                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
510                 return 0;
511         }
512         switch(texformat)
513         {
514         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
515         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
516         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
517                 break;
518         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
519                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
520                 {
521                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
522                         return 0;
523                 }
524                 if (depth != 1)
525                 {
526                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
527                         return 0;
528                 }
529                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
530                 {
531                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
532                         return 0;
533                 }
534                 break;
535         }
536         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
537         {
538                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
539                 return 0;
540         }
541         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
542         {
543                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
544                 return 0;
545         }
546         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
547         {
548                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
549                 return 0;
550         }
551         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
552         {
553                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
554                 return 0;
555         }
556         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
557         {
558                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
559                 return 0;
560         }
561         // find first empty slot in texture array
562         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
563                 if (!dpsoftrast.texture[texnum].bytes)
564                         break;
565         dpsoftrast.texture_firstfree = texnum + 1;
566         if (dpsoftrast.texture_max <= texnum)
567                 DPSOFTRAST_Texture_Grow();
568         if (dpsoftrast.texture_end <= texnum)
569                 dpsoftrast.texture_end = texnum + 1;
570         texture = &dpsoftrast.texture[texnum];
571         memset(texture, 0, sizeof(*texture));
572         texture->flags = flags;
573         texture->width = width;
574         texture->height = height;
575         texture->depth = depth;
576         texture->sides = sides;
577         texture->binds = 0;
578         w = width;
579         h = height;
580         d = depth;
581         size = 0;
582         mipmaps = 0;
583         w = width;
584         h = height;
585         d = depth;
586         for (;;)
587         {
588                 s = w * h * d * sides * 4;
589                 texture->mipmap[mipmaps][0] = size;
590                 texture->mipmap[mipmaps][1] = s;
591                 texture->mipmap[mipmaps][2] = w;
592                 texture->mipmap[mipmaps][3] = h;
593                 texture->mipmap[mipmaps][4] = d;
594                 size += s;
595                 mipmaps++;
596                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
597                         break;
598                 if (w > 1) w >>= 1;
599                 if (h > 1) h >>= 1;
600                 if (d > 1) d >>= 1;
601         }
602         texture->mipmaps = mipmaps;
603         texture->size = size;
604
605         // allocate the pixels now
606         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
607
608         return texnum;
609 }
610 void DPSOFTRAST_Texture_Free(int index)
611 {
612         DPSOFTRAST_Texture *texture;
613         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
614         if (texture->binds)
615                 DPSOFTRAST_Flush();
616         if (texture->bytes)
617                 MM_FREE(texture->bytes);
618         texture->bytes = NULL;
619         memset(texture, 0, sizeof(*texture));
620         // adjust the free range and used range
621         if (dpsoftrast.texture_firstfree > index)
622                 dpsoftrast.texture_firstfree = index;
623         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
624                 dpsoftrast.texture_end--;
625 }
626 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
627 {
628         int i, x, y, z, w, layer0, layer1, row0, row1;
629         unsigned char *o, *i0, *i1, *i2, *i3;
630         DPSOFTRAST_Texture *texture;
631         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
632         if (texture->mipmaps <= 1)
633                 return;
634         for (i = 1;i < texture->mipmaps;i++)
635         {
636                 for (z = 0;z < texture->mipmap[i][4];z++)
637                 {
638                         layer0 = z*2;
639                         layer1 = z*2+1;
640                         if (layer1 >= texture->mipmap[i-1][4])
641                                 layer1 = texture->mipmap[i-1][4]-1;
642                         for (y = 0;y < texture->mipmap[i][3];y++)
643                         {
644                                 row0 = y*2;
645                                 row1 = y*2+1;
646                                 if (row1 >= texture->mipmap[i-1][3])
647                                         row1 = texture->mipmap[i-1][3]-1;
648                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
649                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
650                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
651                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
652                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
653                                 w = texture->mipmap[i][2];
654                                 if (layer1 > layer0)
655                                 {
656                                         if (texture->mipmap[i-1][2] > 1)
657                                         {
658                                                 // average 3D texture
659                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
660                                                 {
661                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
662                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
663                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
664                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
665                                                 }
666                                         }
667                                         else
668                                         {
669                                                 // average 3D mipmap with parent width == 1
670                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
671                                                 {
672                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
673                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
674                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
675                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
676                                                 }
677                                         }
678                                 }
679                                 else
680                                 {
681                                         if (texture->mipmap[i-1][2] > 1)
682                                         {
683                                                 // average 2D texture (common case)
684                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
685                                                 {
686                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
687                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
688                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
689                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
690                                                 }
691                                         }
692                                         else
693                                         {
694                                                 // 2D texture with parent width == 1
695                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
696                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
697                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
698                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
699                                         }
700                                 }
701                         }
702                 }
703         }
704 }
705 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
706 {
707         DPSOFTRAST_Texture *texture;
708         unsigned char *dst;
709         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
710         if (texture->binds)
711                 DPSOFTRAST_Flush();
712         if (pixels)
713         {
714                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
715                 while (blockheight > 0)
716                 {
717                         memcpy(dst, pixels, blockwidth * 4);
718                         pixels += blockwidth * 4;
719                         dst += texture->mipmap[0][2] * 4;
720                         blockheight--;
721                 }
722         }
723         DPSOFTRAST_Texture_CalculateMipmaps(index);
724 }
725 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
726 {
727         DPSOFTRAST_Texture *texture;
728         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
729         if (texture->binds)
730                 DPSOFTRAST_Flush();
731         if (pixels)
732                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
733         DPSOFTRAST_Texture_CalculateMipmaps(index);
734 }
735 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
736 {
737         DPSOFTRAST_Texture *texture;
738         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
739         return texture->mipmap[mip][2];
740 }
741 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
742 {
743         DPSOFTRAST_Texture *texture;
744         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
745         return texture->mipmap[mip][3];
746 }
747 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
748 {
749         DPSOFTRAST_Texture *texture;
750         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
751         return texture->mipmap[mip][4];
752 }
753 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
754 {
755         DPSOFTRAST_Texture *texture;
756         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
757         if (texture->binds)
758                 DPSOFTRAST_Flush();
759         return texture->bytes + texture->mipmap[mip][0];
760 }
761 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
762 {
763         DPSOFTRAST_Texture *texture;
764         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
765         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
766         {
767                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
768                 return;
769         }
770         if (texture->binds)
771                 DPSOFTRAST_Flush();
772         texture->filter = filter;
773 }
774
775 static void DPSOFTRAST_Draw_FlushThreads(void);
776
777 static void DPSOFTRAST_Draw_SyncCommands(void)
778 {
779         if(dpsoftrast.usethreads) MEMORY_BARRIER;
780         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
781 }
782
783 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
784 {
785         DPSOFTRAST_State_Thread *thread;
786         int i;
787         int freecommand = dpsoftrast.commandpool.freecommand;
788         int usedcommands = dpsoftrast.commandpool.usedcommands;
789         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
790                 return;
791         DPSOFTRAST_Draw_SyncCommands();
792         for(;;)
793         {
794                 int waitindex = -1;
795                 int commandoffset;
796                 usedcommands = 0;
797                 for (i = 0; i < dpsoftrast.numthreads; i++)
798                 {
799                         thread = &dpsoftrast.threads[i]; 
800                         commandoffset = freecommand - thread->commandoffset;
801                         if (commandoffset < 0)
802                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
803                         if (commandoffset > usedcommands)
804                         {
805                                 waitindex = i;
806                                 usedcommands = commandoffset;
807                         }
808                 }
809                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
810                         break;
811                 thread = &dpsoftrast.threads[waitindex];
812                 Thread_LockMutex(thread->drawmutex);
813                 if (thread->commandoffset != dpsoftrast.drawcommand)
814                 {
815                         thread->waiting = true;
816                         if (thread->starving) Thread_CondSignal(thread->drawcond);
817                         Thread_CondWait(thread->waitcond, thread->drawmutex);
818                         thread->waiting = false;
819                 }
820                 Thread_UnlockMutex(thread->drawmutex);
821         }
822         dpsoftrast.commandpool.usedcommands = usedcommands;
823 }
824
// Number of padding units needed to bring `size` up to the next multiple of
// COMMAND_SIZE (0 when it already is one).  Relies on COMMAND_SIZE being a
// power of two, since it is used as a bit mask.
#define DPSOFTRAST_COMMANDPADDING(size) \
	((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1))
// `size` rounded up to a multiple of COMMAND_SIZE.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	((size) + DPSOFTRAST_COMMANDPADDING(size))
// Allocates a DPSOFTRAST_Command_<name> from the command pool with opcode
// DPSOFTRAST_OPCODE_<name> and returns it typed as that command struct.
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
829
830 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
831 {
832         DPSOFTRAST_Command *command;
833         int freecommand = dpsoftrast.commandpool.freecommand;
834         int usedcommands = dpsoftrast.commandpool.usedcommands;
835         int extra = sizeof(DPSOFTRAST_Command);
836         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
837                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
838         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
839         {
840                 if (dpsoftrast.usethreads)
841                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
842                 else
843                         DPSOFTRAST_Draw_FlushThreads();
844                 freecommand = dpsoftrast.commandpool.freecommand;
845                 usedcommands = dpsoftrast.commandpool.usedcommands;
846         }
847         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
848         {
849                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
850                 command->opcode = DPSOFTRAST_OPCODE_Reset;
851                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
852                 freecommand = 0;
853         }
854         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
855         command->opcode = opcode;
856         command->commandsize = size;
857         freecommand += size;
858         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
859                 freecommand = 0;
860         dpsoftrast.commandpool.freecommand = freecommand;
861         dpsoftrast.commandpool.usedcommands = usedcommands + size;
862         return command;
863 }
864
865 static void DPSOFTRAST_UndoCommand(int size)
866 {
867         int freecommand = dpsoftrast.commandpool.freecommand;
868         int usedcommands = dpsoftrast.commandpool.usedcommands;
869         freecommand -= size;
870         if (freecommand < 0)
871                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
872         usedcommands -= size;
873         dpsoftrast.commandpool.freecommand = freecommand;
874         dpsoftrast.commandpool.usedcommands = usedcommands;
875 }
876                 
877 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
878 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
879 {
880         thread->viewport[0] = command->x;
881         thread->viewport[1] = command->y;
882         thread->viewport[2] = command->width;
883         thread->viewport[3] = command->height;
884         thread->validate |= DPSOFTRAST_VALIDATE_FB;
885 }
886 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
887 {
888         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
889         command->x = x;
890         command->y = y;
891         command->width = width;
892         command->height = height;
893
894         dpsoftrast.viewport[0] = x;
895         dpsoftrast.viewport[1] = y;
896         dpsoftrast.viewport[2] = width;
897         dpsoftrast.viewport[3] = height;
898         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
899 }
900
901 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
902 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
903 {
904         int i, x1, y1, x2, y2, w, h, x, y;
905         int miny1, maxy1, miny2, maxy2;
906         int bandy;
907         unsigned int *p;
908         unsigned int c;
909         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
910         miny1 = thread->miny1;
911         maxy1 = thread->maxy1;
912         miny2 = thread->miny2;
913         maxy2 = thread->maxy2;
914         x1 = thread->fb_scissor[0];
915         y1 = thread->fb_scissor[1];
916         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
917         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
918         if (y1 < miny1) y1 = miny1;
919         if (y2 > maxy2) y2 = maxy2;
920         w = x2 - x1;
921         h = y2 - y1;
922         if (w < 1 || h < 1)
923                 return;
924         // FIXME: honor fb_colormask?
925         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
926         for (i = 0;i < 4;i++)
927         {
928                 if (!dpsoftrast.fb_colorpixels[i])
929                         continue;
930                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
931                 for (;y < bandy;y++)
932                 {
933                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
934                         for (x = x1;x < x2;x++)
935                                 p[x] = c;
936                 }
937         }
938 }
939 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
940 {
941         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
942         command->r = r;
943         command->g = g;
944         command->b = b;
945         command->a = a;
946 }
947
948 DEFCOMMAND(3, ClearDepth, float depth;)
949 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
950 {
951         int x1, y1, x2, y2, w, h, x, y;
952         int miny1, maxy1, miny2, maxy2;
953         int bandy;
954         unsigned int *p;
955         unsigned int c;
956         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
957         miny1 = thread->miny1;
958         maxy1 = thread->maxy1;
959         miny2 = thread->miny2;
960         maxy2 = thread->maxy2;
961         x1 = thread->fb_scissor[0];
962         y1 = thread->fb_scissor[1];
963         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
964         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
965         if (y1 < miny1) y1 = miny1;
966         if (y2 > maxy2) y2 = maxy2;
967         w = x2 - x1;
968         h = y2 - y1;
969         if (w < 1 || h < 1)
970                 return;
971         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
972         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
973         for (;y < bandy;y++)
974         {
975                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
976                 for (x = x1;x < x2;x++)
977                         p[x] = c;
978         }
979 }
980 void DPSOFTRAST_ClearDepth(float d)
981 {
982         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
983         command->depth = d;
984 }
985
986 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
987 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
988 {
989         thread->colormask[0] = command->r != 0;
990         thread->colormask[1] = command->g != 0;
991         thread->colormask[2] = command->b != 0;
992         thread->colormask[3] = command->a != 0;
993         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
994 }
995 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
996 {
997         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
998         command->r = r;
999         command->g = g;
1000         command->b = b;
1001         command->a = a;
1002 }
1003
1004 DEFCOMMAND(5, DepthTest, int enable;)
1005 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1006 {
1007         thread->depthtest = command->enable;
1008         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1009 }
1010 void DPSOFTRAST_DepthTest(int enable)
1011 {
1012         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1013         command->enable = enable;
1014 }
1015
1016 DEFCOMMAND(6, ScissorTest, int enable;)
1017 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1018 {
1019         thread->scissortest = command->enable;
1020         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1021 }
1022 void DPSOFTRAST_ScissorTest(int enable)
1023 {
1024         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1025         command->enable = enable;
1026 }
1027
1028 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1029 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1030 {
1031         thread->scissor[0] = command->x;
1032         thread->scissor[1] = command->y;
1033         thread->scissor[2] = command->width;
1034         thread->scissor[3] = command->height;
1035         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1036 }
1037 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1038 {
1039         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1040         command->x = x;
1041         command->y = y;
1042         command->width = width;
1043         command->height = height;
1044 }
1045
1046 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1047 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1048 {
1049         thread->blendfunc[0] = command->sfactor;
1050         thread->blendfunc[1] = command->dfactor;
1051         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1052 }
1053 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1054 {
1055         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1056         command->sfactor = sfactor;
1057         command->dfactor = dfactor;
1058 }
1059
1060 DEFCOMMAND(9, BlendSubtract, int enable;)
1061 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1062 {
1063         thread->blendsubtract = command->enable;
1064         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1065 }
1066 void DPSOFTRAST_BlendSubtract(int enable)
1067 {
1068         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1069         command->enable = enable;
1070 }
1071
1072 DEFCOMMAND(10, DepthMask, int enable;)
1073 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1074 {
1075         thread->depthmask = command->enable;
1076 }
1077 void DPSOFTRAST_DepthMask(int enable)
1078 {
1079         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1080         command->enable = enable;
1081 }
1082
1083 DEFCOMMAND(11, DepthFunc, int func;)
1084 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1085 {
1086         thread->depthfunc = command->func;
1087 }
1088 void DPSOFTRAST_DepthFunc(int func)
1089 {
1090         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1091         command->func = func;
1092 }
1093
1094 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1095 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1096 {
1097         thread->depthrange[0] = command->nearval;
1098         thread->depthrange[1] = command->farval;
1099 }
1100 void DPSOFTRAST_DepthRange(float nearval, float farval)
1101 {
1102         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1103         command->nearval = nearval;
1104         command->farval = farval;
1105 }
1106
1107 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1108 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1109 {
1110         thread->polygonoffset[0] = command->alongnormal;
1111         thread->polygonoffset[1] = command->intoview;
1112 }
1113 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1114 {
1115         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1116         command->alongnormal = alongnormal;
1117         command->intoview = intoview;
1118 }
1119
1120 DEFCOMMAND(14, CullFace, int mode;)
1121 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1122 {
1123         thread->cullface = command->mode;
1124 }
1125 void DPSOFTRAST_CullFace(int mode)
1126 {
1127         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1128         command->mode = mode;
1129 }
1130
1131 DEFCOMMAND(15, AlphaTest, int enable;)
1132 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1133 {
1134         thread->alphatest = command->enable;
1135 }
1136 void DPSOFTRAST_AlphaTest(int enable)
1137 {
1138         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1139         command->enable = enable;
1140 }
1141
1142 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1143 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1144 {
1145         thread->alphafunc = command->func;
1146         thread->alphavalue = command->ref;
1147 }
1148 void DPSOFTRAST_AlphaFunc(int func, float ref)
1149 {
1150         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1151         command->func = func;
1152         command->ref = ref;
1153 }
1154
1155 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1156 {
1157         dpsoftrast.color[0] = r;
1158         dpsoftrast.color[1] = g;
1159         dpsoftrast.color[2] = b;
1160         dpsoftrast.color[3] = a;
1161 }
1162
1163 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1164 {
1165         int outstride = blockwidth * 4;
1166         int instride = dpsoftrast.fb_width * 4;
1167         int bx1 = blockx;
1168         int by1 = blocky;
1169         int bx2 = blockx + blockwidth;
1170         int by2 = blocky + blockheight;
1171         int bw;
1172         int x;
1173         int y;
1174         unsigned char *inpixels;
1175         unsigned char *b;
1176         unsigned char *o;
1177         DPSOFTRAST_Flush();
1178         if (bx1 < 0) bx1 = 0;
1179         if (by1 < 0) by1 = 0;
1180         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1181         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1182         bw = bx2 - bx1;
1183         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1184         if (dpsoftrast.bigendian)
1185         {
1186                 for (y = by1;y < by2;y++)
1187                 {
1188                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1189                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1190                         for (x = bx1;x < bx2;x++)
1191                         {
1192                                 o[0] = b[3];
1193                                 o[1] = b[2];
1194                                 o[2] = b[1];
1195                                 o[3] = b[0];
1196                                 o += 4;
1197                                 b += 4;
1198                         }
1199                 }
1200         }
1201         else
1202         {
1203                 for (y = by1;y < by2;y++)
1204                 {
1205                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1206                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1207                         memcpy(o, b, bw*4);
1208                 }
1209         }
1210
1211 }
1212 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1213 {
1214         int tx1 = tx;
1215         int ty1 = ty;
1216         int tx2 = tx + width;
1217         int ty2 = ty + height;
1218         int sx1 = sx;
1219         int sy1 = sy;
1220         int sx2 = sx + width;
1221         int sy2 = sy + height;
1222         int swidth;
1223         int sheight;
1224         int twidth;
1225         int theight;
1226         int sw;
1227         int sh;
1228         int tw;
1229         int th;
1230         int y;
1231         unsigned int *spixels;
1232         unsigned int *tpixels;
1233         DPSOFTRAST_Texture *texture;
1234         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1235         if (mip < 0 || mip >= texture->mipmaps) return;
1236         DPSOFTRAST_Flush();
1237         spixels = dpsoftrast.fb_colorpixels[0];
1238         swidth = dpsoftrast.fb_width;
1239         sheight = dpsoftrast.fb_height;
1240         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1241         twidth = texture->mipmap[mip][2];
1242         theight = texture->mipmap[mip][3];
1243         if (tx1 < 0) tx1 = 0;
1244         if (ty1 < 0) ty1 = 0;
1245         if (tx2 > twidth) tx2 = twidth;
1246         if (ty2 > theight) ty2 = theight;
1247         if (sx1 < 0) sx1 = 0;
1248         if (sy1 < 0) sy1 = 0;
1249         if (sx2 > swidth) sx2 = swidth;
1250         if (sy2 > sheight) sy2 = sheight;
1251         tw = tx2 - tx1;
1252         th = ty2 - ty1;
1253         sw = sx2 - sx1;
1254         sh = sy2 - sy1;
1255         if (tw > sw) tw = sw;
1256         if (th > sh) th = sh;
1257         if (tw < 1 || th < 1)
1258                 return;
1259         sy1 = sheight - 1 - sy1;
1260         for (y = 0;y < th;y++)
1261                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1262         if (texture->mipmaps > 1)
1263                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1264 }
1265
1266 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1267 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1268 {
1269         if (thread->texbound[command->unitnum])
1270                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1271         thread->texbound[command->unitnum] = command->texture;
1272 }
1273 void DPSOFTRAST_SetTexture(int unitnum, int index)
1274 {
1275         DPSOFTRAST_Command_SetTexture *command;
1276         DPSOFTRAST_Texture *texture;
1277         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1278         {
1279                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1280                 return;
1281         }
1282         texture = DPSOFTRAST_Texture_GetByIndex(index);
1283         if (index && !texture)
1284         {
1285                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1286                 return;
1287         }
1288
1289         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1290         command->unitnum = unitnum;
1291         command->texture = texture;
1292
1293         dpsoftrast.texbound[unitnum] = texture;
1294         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1295 }
1296
1297 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1298 {
1299         dpsoftrast.pointer_vertex3f = vertex3f;
1300         dpsoftrast.stride_vertex = stride;
1301 }
1302 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1303 {
1304         dpsoftrast.pointer_color4f = color4f;
1305         dpsoftrast.pointer_color4ub = NULL;
1306         dpsoftrast.stride_color = stride;
1307 }
1308 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1309 {
1310         dpsoftrast.pointer_color4f = NULL;
1311         dpsoftrast.pointer_color4ub = color4ub;
1312         dpsoftrast.stride_color = stride;
1313 }
1314 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1315 {
1316         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1317         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1318         dpsoftrast.stride_texcoord[unitnum] = stride;
1319 }
1320
1321 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1322 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1323 {
1324         thread->shader_mode = command->mode;
1325         thread->shader_permutation = command->permutation;
1326         thread->shader_exactspecularmath = command->exactspecularmath;
1327 }
1328 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1329 {
1330         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1331         command->mode = mode;
1332         command->permutation = permutation;
1333         command->exactspecularmath = exactspecularmath;
1334
1335         dpsoftrast.shader_mode = mode;
1336         dpsoftrast.shader_permutation = permutation;
1337         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1338 }
1339
1340 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1341 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1342 {
1343         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1344 }
1345 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1346 {
1347         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1348         command->index = index;
1349         command->val[0] = v0;
1350         command->val[1] = v1;
1351         command->val[2] = v2;
1352         command->val[3] = v3;
1353
1354         dpsoftrast.uniform4f[index*4+0] = v0;
1355         dpsoftrast.uniform4f[index*4+1] = v1;
1356         dpsoftrast.uniform4f[index*4+2] = v2;
1357         dpsoftrast.uniform4f[index*4+3] = v3;
1358 }
1359 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1360 {
1361         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1362         command->index = index;
1363         memcpy(command->val, v, sizeof(command->val));
1364
1365         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1366 }
1367
1368 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1369 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1370 {
1371         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1372 }
1373 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1374 {
1375 #ifdef SSE2_PRESENT
1376         int i, index;
1377         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1378         {
1379                 __m128 m0, m1, m2, m3;
1380                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1381                 command->index = (DPSOFTRAST_UNIFORM)index;
1382                 if (((size_t)v)&(ALIGN_SIZE-1))
1383                 {
1384                         m0 = _mm_loadu_ps(v);
1385                         m1 = _mm_loadu_ps(v+4);
1386                         m2 = _mm_loadu_ps(v+8);
1387                         m3 = _mm_loadu_ps(v+12);
1388                 }
1389                 else
1390                 {
1391                         m0 = _mm_load_ps(v);
1392                         m1 = _mm_load_ps(v+4);
1393                         m2 = _mm_load_ps(v+8);
1394                         m3 = _mm_load_ps(v+12);
1395                 }
1396                 if (transpose)
1397                 {
1398                         __m128 t0, t1, t2, t3;
1399                         t0 = _mm_unpacklo_ps(m0, m1);
1400                         t1 = _mm_unpacklo_ps(m2, m3);
1401                         t2 = _mm_unpackhi_ps(m0, m1);
1402                         t3 = _mm_unpackhi_ps(m2, m3);
1403                         m0 = _mm_movelh_ps(t0, t1);
1404                         m1 = _mm_movehl_ps(t1, t0);
1405                         m2 = _mm_movelh_ps(t2, t3);
1406                         m3 = _mm_movehl_ps(t3, t2);                     
1407                 }
1408                 _mm_store_ps(command->val, m0);
1409                 _mm_store_ps(command->val+4, m1);
1410                 _mm_store_ps(command->val+8, m2);
1411                 _mm_store_ps(command->val+12, m3);
1412                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1413                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1414                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1415                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1416         }
1417 #endif
1418 }
1419
1420 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1421 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1422 {
1423         thread->uniform1i[command->index] = command->val;
1424 }
1425 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1426 {
1427         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1428         command->index = index;
1429         command->val = i0;
1430
1431         dpsoftrast.uniform1i[command->index] = i0;
1432 }
1433
1434 #ifdef SSE2_PRESENT
1435 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1436 {
1437         float *end = dst + size*4;
1438         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1439         {
1440                 while (dst < end)
1441                 {
1442                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1443                         dst += 4;
1444                         src += stride;
1445                 }
1446         }
1447         else
1448         {
1449                 while (dst < end)
1450                 {
1451                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1452                         dst += 4;
1453                         src += stride;
1454                 }
1455         }
1456 }
1457
1458 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1459 {
1460         float *end = dst + size*4;
1461         if (stride == sizeof(float[3]))
1462         {
1463                 float *end4 = dst + (size&~3)*4;        
1464                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1465                 {
1466                         while (dst < end4)
1467                         {
1468                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1469                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1470                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1471                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1472                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1473                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1474                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1475                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1476                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1477                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1478                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1479                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1480                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1481                                 dst += 16;
1482                                 src += 4*sizeof(float[3]);
1483                         }
1484                 }
1485                 else
1486                 {
1487                         while (dst < end4)
1488                         {
1489                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1490                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1491                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1492                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1493                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1494                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1495                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1496                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1497                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1498                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1501                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1502                                 dst += 16;
1503                                 src += 4*sizeof(float[3]);
1504                         }
1505                 }
1506         }
1507         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1508         {
1509                 while (dst < end)
1510                 {
1511                         __m128 v = _mm_loadu_ps((const float *)src);
1512                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1513                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1514                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1515                         _mm_store_ps(dst, v);
1516                         dst += 4;
1517                         src += stride;
1518                 }
1519         }
1520         else
1521         {
1522                 while (dst < end)
1523                 {
1524                         __m128 v = _mm_load_ps((const float *)src);
1525                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1526                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1527                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1528                         _mm_store_ps(dst, v);
1529                         dst += 4;
1530                         src += stride;
1531                 }
1532         }
1533 }
1534
1535 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1536 {
1537         float *end = dst + size*4;
1538         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1539         if (stride == sizeof(float[2]))
1540         {
1541                 float *end2 = dst + (size&~1)*4;
1542                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1543                 {
1544                         while (dst < end2)
1545                         {
1546                                 __m128 v = _mm_loadu_ps((const float *)src);
1547                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1548                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1549                                 dst += 8;
1550                                 src += 2*sizeof(float[2]);
1551                         }
1552                 }
1553                 else
1554                 {
1555                         while (dst < end2)
1556                         {
1557                                 __m128 v = _mm_load_ps((const float *)src);
1558                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1559                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1560                                 dst += 8;
1561                                 src += 2*sizeof(float[2]);
1562                         }
1563                 }
1564         }
1565         while (dst < end)
1566         {
1567                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1568                 dst += 4;
1569                 src += stride;
1570         }
1571 }
1572
1573 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1574 {
1575         float *end = dst + size*4;
1576         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1577         if (stride == sizeof(unsigned char[4]))
1578         {
1579                 float *end4 = dst + (size&~3)*4;
1580                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1581                 {
1582                         while (dst < end4)
1583                         {
1584                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1585                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1586                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1587                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1588                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1589                                 dst += 16;
1590                                 src += 4*sizeof(unsigned char[4]);
1591                         }
1592                 }
1593                 else
1594                 {
1595                         while (dst < end4)
1596                         {
1597                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1598                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1599                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1600                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1601                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1602                                 dst += 16;
1603                                 src += 4*sizeof(unsigned char[4]);
1604                         }
1605                 }
1606         }
1607         while (dst < end)
1608         {
1609                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1610                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1611                 dst += 4;
1612                 src += stride;
1613         }
1614 }
1615
// Replicates the four floats at `src` into `size` consecutive four-float
// elements starting at `dst`.
// src may be unaligned; dst is written with aligned stores and must be
// 16-byte aligned.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	float * const stop = dst + 4*size;
	for (; dst < stop; dst += 4)
		_mm_store_ps(dst, value);
}
1626 #endif
1627
// Multiplies `numitems` four-float vectors by a 4x4 matrix.
// With an input (x, y, z, w) and the matrix read as four groups of four floats
// m0..m3, each output is x*m0 + y*m1 + z*m2 + w*m3.
//   out4f       - destination; may be the same pointer as in4f; written with
//                 aligned stores, so it must be 16-byte aligned
//   in4f        - source vectors (any alignment)
//   inmatrix16f - the 16 matrix floats (any alignment)
// When the matrix compares bytewise equal to identity the data is only copied
// (or left alone if out4f == in4f).
// Without SSE2_PRESENT this function does nothing.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 row0, row1, row2, row3;
	float *stop;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	stop = out4f + numitems*4;
	row0 = _mm_loadu_ps(inmatrix16f);
	row1 = _mm_loadu_ps(inmatrix16f + 4);
	row2 = _mm_loadu_ps(inmatrix16f + 8);
	row3 = _mm_loadu_ps(inmatrix16f + 12);
	if (!(((size_t)in4f)&(ALIGN_SIZE-1))) // input aligned: use aligned loads
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_load_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
	else
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_loadu_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
#endif
}
1675
// Copies `numitems` four-float vectors from in4f to out4f.
// The copy is done with memcpy, so the two ranges must not overlap.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1680
1681 #ifdef SSE2_PRESENT
// Perspective-divides and viewport-maps one vertex held in an __m128.
//   in  = (x, y, z, w)
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// where c = viewportcenter and s = viewportscale; lane 0 of both vectors
// applies to the 1/w term and lanes 1..3 apply to x, y and z.
// `out` may be the same variable as `in`. The expansion is a brace block.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 projvert_p = (in); \
	__m128 projvert_w = _mm_shuffle_ps(projvert_p, projvert_p, _MM_SHUFFLE(3, 3, 3, 3)); \
	projvert_p = _mm_shuffle_ps(projvert_p, projvert_p, _MM_SHUFFLE(2, 1, 0, 3)); \
	projvert_p = _mm_move_ss(projvert_p, _mm_set_ss(1.0f)); \
	projvert_p = _mm_mul_ps(viewportscale, projvert_p); \
	projvert_p = _mm_div_ps(projvert_p, projvert_w); \
	projvert_p = _mm_add_ps(viewportcenter, projvert_p); \
	out = _mm_shuffle_ps(projvert_p, projvert_p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1689
// Performs the same computation as DPSOFTRAST_PROJECTVERTEX (all four lanes
// are processed, not only y):
//   in  = (x, y, z, w)
//   out = (c1 + s1*x/w, c2 + s2*y/w, c3 + s3*z/w, c0 + s0/w)
// with c = viewportcenter and s = viewportscale.
// `out` may be the same variable as `in`. The expansion is a brace block.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 projy_p = (in); \
	__m128 projy_w = _mm_shuffle_ps(projy_p, projy_p, _MM_SHUFFLE(3, 3, 3, 3)); \
	projy_p = _mm_shuffle_ps(projy_p, projy_p, _MM_SHUFFLE(2, 1, 0, 3)); \
	projy_p = _mm_move_ss(projy_p, _mm_set_ss(1.0f)); \
	projy_p = _mm_mul_ps(viewportscale, projy_p); \
	projy_p = _mm_div_ps(projy_p, projy_w); \
	projy_p = _mm_add_ps(viewportcenter, projy_p); \
	out = _mm_shuffle_ps(projy_p, projy_p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1697
// Transforms one vertex held in an __m128 by a matrix given as four __m128
// values: for in = (x, y, z, w), out = x*m0 + y*m1 + z*m2 + w*m3, summed as
// x*m0 + (y*m1 + (z*m2 + w*m3)).
// `out` may be the same variable as `in`. The expansion is a brace block.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	const __m128 xfv_in = (in); \
	const __m128 xfv_x = _mm_mul_ps(_mm_shuffle_ps(xfv_in, xfv_in, _MM_SHUFFLE(0, 0, 0, 0)), m0); \
	const __m128 xfv_y = _mm_mul_ps(_mm_shuffle_ps(xfv_in, xfv_in, _MM_SHUFFLE(1, 1, 1, 1)), m1); \
	const __m128 xfv_z = _mm_mul_ps(_mm_shuffle_ps(xfv_in, xfv_in, _MM_SHUFFLE(2, 2, 2, 2)), m2); \
	const __m128 xfv_w = _mm_mul_ps(_mm_shuffle_ps(xfv_in, xfv_in, _MM_SHUFFLE(3, 3, 3, 3)), m3); \
	out = _mm_add_ps(xfv_x, _mm_add_ps(xfv_y, _mm_add_ps(xfv_z, xfv_w))); \
}
1706
1707 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1708 {
1709         int clipmask = 0xFF;
1710         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1711         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1712         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1713         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1714         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1715         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1716         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1717         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1718         #define BBFRONT(k, pos) \
1719         { \
1720                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1721                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1722                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1723                 { \
1724                         __m128 proj; \
1725                         clipmask &= ~(1<<k); \
1726                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1727                         minproj = _mm_min_ss(minproj, proj); \
1728                         maxproj = _mm_max_ss(maxproj, proj); \
1729                 } \
1730         }
1731         BBFRONT(0, minpos); 
1732         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1733         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1734         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1735         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1736         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1737         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1738         BBFRONT(7, maxpos);
1739         #define BBCLIP(k) \
1740         { \
1741                 if (clipmask&(1<<k)) \
1742                 { \
1743                         if (!(clipmask&(1<<(k^1)))) \
1744                         { \
1745                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1746                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1747                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1748                                 minproj = _mm_min_ss(minproj, proj); \
1749                                 maxproj = _mm_max_ss(maxproj, proj); \
1750                         } \
1751                         if (!(clipmask&(1<<(k^2)))) \
1752                         { \
1753                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1754                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1755                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1756                                 minproj = _mm_min_ss(minproj, proj); \
1757                                 maxproj = _mm_max_ss(maxproj, proj); \
1758                         } \
1759                         if (!(clipmask&(1<<(k^4)))) \
1760                         { \
1761                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1762                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1763                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1764                                 minproj = _mm_min_ss(minproj, proj); \
1765                                 maxproj = _mm_max_ss(maxproj, proj); \
1766                         } \
1767                 } \
1768         }
1769         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1770         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1771         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1772         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1773         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1774         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1775         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1776         *starty = _mm_cvttss_si32(maxproj);
1777         *endy = _mm_cvttss_si32(minproj)+1;
1778         return clipmask;
1779 }
1780         
1781 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1782 {
1783         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1784         float *end = out4f + numitems*4;
1785         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1786         __m128 minpos, maxpos;
1787         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1788         {
1789                 minpos = maxpos = _mm_loadu_ps(in4f);
1790                 while (out4f < end)
1791                 {
1792                         __m128 v = _mm_loadu_ps(in4f);
1793                         minpos = _mm_min_ps(minpos, v);
1794                         maxpos = _mm_max_ps(maxpos, v);
1795                         _mm_store_ps(out4f, v);
1796                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1797                         _mm_store_ps(screen4f, v);
1798                         in4f += 4;
1799                         out4f += 4;
1800                         screen4f += 4;
1801                 }
1802         }
1803         else
1804         {
1805                 minpos = maxpos = _mm_load_ps(in4f);
1806                 while (out4f < end)
1807                 {
1808                         __m128 v = _mm_load_ps(in4f);
1809                         minpos = _mm_min_ps(minpos, v);
1810                         maxpos = _mm_max_ps(maxpos, v);
1811                         _mm_store_ps(out4f, v);
1812                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1813                         _mm_store_ps(screen4f, v);
1814                         in4f += 4;
1815                         out4f += 4;
1816                         screen4f += 4;
1817                 }
1818         }
1819         if (starty && endy) 
1820         {
1821                 ALIGN(float minposf[4]);
1822                 ALIGN(float maxposf[4]);
1823                 _mm_store_ps(minposf, minpos);
1824                 _mm_store_ps(maxposf, maxpos);
1825                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1826         }
1827         return 0;
1828 }
1829
1830 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1831 {
1832         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1833         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1834         float *end;
1835         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1836                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1837         end = out4f + numitems*4;
1838         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1839         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1840         m0 = _mm_loadu_ps(inmatrix16f);
1841         m1 = _mm_loadu_ps(inmatrix16f + 4);
1842         m2 = _mm_loadu_ps(inmatrix16f + 8);
1843         m3 = _mm_loadu_ps(inmatrix16f + 12);
1844         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1845         {
1846                 minpos = maxpos = _mm_loadu_ps(in4f);
1847                 while (out4f < end)
1848                 {
1849                         __m128 v = _mm_loadu_ps(in4f);
1850                         minpos = _mm_min_ps(minpos, v);
1851                         maxpos = _mm_max_ps(maxpos, v);
1852                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1853                         _mm_store_ps(out4f, v);
1854                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1855                         _mm_store_ps(screen4f, v);
1856                         in4f += 4;
1857                         out4f += 4;
1858                         screen4f += 4;
1859                 }
1860         }
1861         else
1862         {
1863                 minpos = maxpos = _mm_load_ps(in4f);
1864                 while (out4f < end)
1865                 {
1866                         __m128 v = _mm_load_ps(in4f);
1867                         minpos = _mm_min_ps(minpos, v);
1868                         maxpos = _mm_max_ps(maxpos, v);
1869                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1870                         _mm_store_ps(out4f, v);
1871                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1872                         _mm_store_ps(screen4f, v);
1873                         in4f += 4;
1874                         out4f += 4;
1875                         screen4f += 4;
1876                 }
1877         }
1878         if (starty && endy) 
1879         {
1880                 ALIGN(float minposf[4]);
1881                 ALIGN(float maxposf[4]);
1882                 _mm_store_ps(minposf, minpos);
1883                 _mm_store_ps(maxposf, maxpos);
1884                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1885         }
1886         return 0;
1887 }
1888 #endif
1889
// Expands one client vertex array into dpsoftrast.post_array4f[outarray] as
// packed four-float elements, for vertices firstvertex .. firstvertex +
// numvertices - 1 of the current draw.
//   inarray == DPSOFTRAST_ARRAY_POSITION: three floats per vertex
//   inarray == DPSOFTRAST_ARRAY_COLOR:    float colors if a float pointer is
//       set, else byte colors if a byte pointer is set, else every element is
//       filled with dpsoftrast.color
//   anything else: texcoord array number (inarray - DPSOFTRAST_ARRAY_TEXCOORD0)
//       with 2, 3 or 4 float components; nothing is written when its pointer is
//       NULL or the component count is some other value
// Returns the output array, or NULL when built without SSE2_PRESENT.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float * const outf = dpsoftrast.post_array4f[outarray];
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	const unsigned char *inb;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
		{
			// no color array bound: use the constant color for every vertex
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
		}
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
				break;
			default:
				// other component counts leave the output untouched
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1948
1949 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1950 {
1951         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1952         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1953         return data;
1954 }
1955
1956 #if 0
// Projects an array in place with DPSOFTRAST_Vertex_Project, filling
// dpsoftrast.screencoord4f, dpsoftrast.drawstarty / drawendy and
// dpsoftrast.drawclipped, and returns the array.
// With inarray >= 0 the data is first loaded by DPSOFTRAST_Array_Load(outarray,
// inarray); otherwise dpsoftrast.post_array4f[outarray] is used as is.
// Returns NULL when built without SSE2_PRESENT.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
1967 #endif
1968
// Transforms and projects an array in place with
// DPSOFTRAST_Vertex_TransformProject, filling dpsoftrast.screencoord4f,
// dpsoftrast.drawstarty / drawendy and dpsoftrast.drawclipped, and returns the
// array.
// With inarray >= 0 the data is first loaded by DPSOFTRAST_Array_Load(outarray,
// inarray); otherwise dpsoftrast.post_array4f[outarray] is used as is.
// Returns NULL when built without SSE2_PRESENT.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
1979
1980 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1981 {
1982         int x;
1983         int startx = span->startx;
1984         int endx = span->endx;
1985         float wslope = triangle->w[0];
1986         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1987         float endz = 1.0f / (w + wslope * startx);
1988         for (x = startx;x < endx;)
1989         {
1990                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1991                 float z = endz, dz;
1992                 if (nextsub >= endx) nextsub = endsub = endx-1;
1993                 endz = 1.0f / (w + wslope * nextsub);
1994                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1995                 for (; x <= endsub; x++, z += dz)
1996                         zf[x] = z;
1997         }
1998 }
1999
2000 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
2001 {
2002         int x;
2003         int startx = span->startx;
2004         int endx = span->endx;
2005         int d[4];
2006         float a, b;
2007         unsigned char * RESTRICT pixelmask = span->pixelmask;
2008         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2009         if (!pixel)
2010                 return;
2011         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2012         // handle alphatest now (this affects depth writes too)
2013         if (thread->alphatest)
2014                 for (x = startx;x < endx;x++)
2015                         if (in4f[x*4+3] < 0.5f)
2016                                 pixelmask[x] = false;
2017         // FIXME: this does not handle bigendian
2018         switch(thread->fb_blendmode)
2019         {
2020         case DPSOFTRAST_BLENDMODE_OPAQUE:
2021                 for (x = startx;x < endx;x++)
2022                 {
2023                         if (!pixelmask[x])
2024                                 continue;
2025                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2026                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2027                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2028                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2029                         pixel[x*4+0] = d[0];
2030                         pixel[x*4+1] = d[1];
2031                         pixel[x*4+2] = d[2];
2032                         pixel[x*4+3] = d[3];
2033                 }
2034                 break;
2035         case DPSOFTRAST_BLENDMODE_ALPHA:
2036                 for (x = startx;x < endx;x++)
2037                 {
2038                         if (!pixelmask[x])
2039                                 continue;
2040                         a = in4f[x*4+3] * 255.0f;
2041                         b = 1.0f - in4f[x*4+3];
2042                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2043                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2044                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2045                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2046                         pixel[x*4+0] = d[0];
2047                         pixel[x*4+1] = d[1];
2048                         pixel[x*4+2] = d[2];
2049                         pixel[x*4+3] = d[3];
2050                 }
2051                 break;
2052         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2053                 for (x = startx;x < endx;x++)
2054                 {
2055                         if (!pixelmask[x])
2056                                 continue;
2057                         a = in4f[x*4+3] * 255.0f;
2058                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2059                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2060                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2061                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2062                         pixel[x*4+0] = d[0];
2063                         pixel[x*4+1] = d[1];
2064                         pixel[x*4+2] = d[2];
2065                         pixel[x*4+3] = d[3];
2066                 }
2067                 break;
2068         case DPSOFTRAST_BLENDMODE_ADD:
2069                 for (x = startx;x < endx;x++)
2070                 {
2071                         if (!pixelmask[x])
2072                                 continue;
2073                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2074                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2075                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2076                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2077                         pixel[x*4+0] = d[0];
2078                         pixel[x*4+1] = d[1];
2079                         pixel[x*4+2] = d[2];
2080                         pixel[x*4+3] = d[3];
2081                 }
2082                 break;
2083         case DPSOFTRAST_BLENDMODE_INVMOD:
2084                 for (x = startx;x < endx;x++)
2085                 {
2086                         if (!pixelmask[x])
2087                                 continue;
2088                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2089                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2090                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2091                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2092                         pixel[x*4+0] = d[0];
2093                         pixel[x*4+1] = d[1];
2094                         pixel[x*4+2] = d[2];
2095                         pixel[x*4+3] = d[3];
2096                 }
2097                 break;
2098         case DPSOFTRAST_BLENDMODE_MUL:
2099                 for (x = startx;x < endx;x++)
2100                 {
2101                         if (!pixelmask[x])
2102                                 continue;
2103                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2104                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2105                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2106                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2107                         pixel[x*4+0] = d[0];
2108                         pixel[x*4+1] = d[1];
2109                         pixel[x*4+2] = d[2];
2110                         pixel[x*4+3] = d[3];
2111                 }
2112                 break;
2113         case DPSOFTRAST_BLENDMODE_MUL2:
2114                 for (x = startx;x < endx;x++)
2115                 {
2116                         if (!pixelmask[x])
2117                                 continue;
2118                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2119                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2120                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2121                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2122                         pixel[x*4+0] = d[0];
2123                         pixel[x*4+1] = d[1];
2124                         pixel[x*4+2] = d[2];
2125                         pixel[x*4+3] = d[3];
2126                 }
2127                 break;
2128         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2129                 for (x = startx;x < endx;x++)
2130                 {
2131                         if (!pixelmask[x])
2132                                 continue;
2133                         a = in4f[x*4+3] * -255.0f;
2134                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2135                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2136                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2137                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2138                         pixel[x*4+0] = d[0];
2139                         pixel[x*4+1] = d[1];
2140                         pixel[x*4+2] = d[2];
2141                         pixel[x*4+3] = d[3];
2142                 }
2143                 break;
2144         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2145                 for (x = startx;x < endx;x++)
2146                 {
2147                         if (!pixelmask[x])
2148                                 continue;
2149                         a = 255.0f;
2150                         b = 1.0f - in4f[x*4+3];
2151                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2152                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2153                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2154                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2155                         pixel[x*4+0] = d[0];
2156                         pixel[x*4+1] = d[1];
2157                         pixel[x*4+2] = d[2];
2158                         pixel[x*4+3] = d[3];
2159                 }
2160                 break;
2161         case DPSOFTRAST_BLENDMODE_INVADD:
2162                 for (x = startx;x < endx;x++)
2163                 {
2164                         if (!pixelmask[x])
2165                                 continue;
2166                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2167                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2168                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2169                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2170                         pixel[x*4+0] = d[0];
2171                         pixel[x*4+1] = d[1];
2172                         pixel[x*4+2] = d[2];
2173                         pixel[x*4+3] = d[3];
2174                 }
2175                 break;
2176         }
2177 }
2178
2179 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2180 {
2181 #ifdef SSE2_PRESENT
2182         int x;
2183         int startx = span->startx;
2184         int endx = span->endx;
2185         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2186         unsigned char * RESTRICT pixelmask = span->pixelmask;
2187         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2188         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2189         if (!pixel)
2190                 return;
2191         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2192         pixeli += span->y * dpsoftrast.fb_width + span->x;
2193         // handle alphatest now (this affects depth writes too)
2194         if (thread->alphatest)
2195                 for (x = startx;x < endx;x++)
2196                         if (in4ub[x*4+3] < 0.5f)
2197                                 pixelmask[x] = false;
2198         // FIXME: this does not handle bigendian
2199         switch(thread->fb_blendmode)
2200         {
2201         case DPSOFTRAST_BLENDMODE_OPAQUE:
2202                 for (x = startx;x + 4 <= endx;)
2203                 {
2204                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2205                         {
2206                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2207                                 x += 4;
2208                         }
2209                         else
2210                         {
2211                                 if (pixelmask[x])
2212                                         pixeli[x] = ini[x];
2213                                 x++;
2214                         }
2215                 }
2216                 for (;x < endx;x++)
2217                         if (pixelmask[x])
2218                                 pixeli[x] = ini[x];
2219                 break;
2220         case DPSOFTRAST_BLENDMODE_ALPHA:
2221         #define FINISHBLEND(blend2, blend1) \
2222                 for (x = startx;x + 1 < endx;x += 2) \
2223                 { \
2224                         __m128i src, dst; \
2225                         switch (*(const unsigned short*)&pixelmask[x]) \
2226                         { \
2227                         case 0x0101: \
2228                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2229                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2230                                 blend2; \
2231                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2232                                 continue; \
2233                         case 0x0100: \
2234                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2235                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2236                                 blend1; \
2237                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2238                                 continue; \
2239                         case 0x0001: \
2240                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2241                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2242                                 blend1; \
2243                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2244                                 continue; \
2245                         } \
2246                         break; \
2247                 } \
2248                 for(;x < endx; x++) \
2249                 { \
2250                         __m128i src, dst; \
2251                         if (!pixelmask[x]) \
2252                                 continue; \
2253                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2254                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2255                         blend1; \
2256                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2257                 }
2258
2259                 FINISHBLEND({
2260                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2261                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2262                 }, {
2263                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2264                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2265                 });
2266                 break;
2267         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2268                 FINISHBLEND({
2269                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2270                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2271                 }, {
2272                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2273                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2274                 });
2275                 break;
2276         case DPSOFTRAST_BLENDMODE_ADD:
2277                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2278                 break;
2279         case DPSOFTRAST_BLENDMODE_INVMOD:
2280                 FINISHBLEND({
2281                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2282                 }, {
2283                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2284                 });
2285                 break;
2286         case DPSOFTRAST_BLENDMODE_MUL:
2287                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2288                 break;
2289         case DPSOFTRAST_BLENDMODE_MUL2:
2290                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2291                 break;
2292         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2293                 FINISHBLEND({
2294                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2295                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2296                 }, {
2297                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2298                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2299                 });
2300                 break;
2301         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2302                 FINISHBLEND({
2303                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2304                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2305                 }, {
2306                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2307                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2308                 });
2309                 break;
2310         case DPSOFTRAST_BLENDMODE_INVADD:
2311                 FINISHBLEND({
2312                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2313                 }, {
2314                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2315                 });
2316                 break;
2317         }
2318 #endif
2319 }
2320
2321 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2322 {
2323         int x;
2324         int startx = span->startx;
2325         int endx = span->endx;
2326         int flags;
2327         float c[4];
2328         float data[4];
2329         float slope[4];
2330         float tc[2], endtc[2];
2331         float tcscale[2];
2332         unsigned int tci[2];
2333         unsigned int tci1[2];
2334         unsigned int tcimin[2];
2335         unsigned int tcimax[2];
2336         int tciwrapmask[2];
2337         int tciwidth;
2338         int filter;
2339         int mip;
2340         const unsigned char * RESTRICT pixelbase;
2341         const unsigned char * RESTRICT pixel[4];
2342         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2343         // if no texture is bound, just fill it with white
2344         if (!texture)
2345         {
2346                 for (x = startx;x < endx;x++)
2347                 {
2348                         out4f[x*4+0] = 1.0f;
2349                         out4f[x*4+1] = 1.0f;
2350                         out4f[x*4+2] = 1.0f;
2351                         out4f[x*4+3] = 1.0f;
2352                 }
2353                 return;
2354         }
2355         mip = triangle->mip[texunitindex];
2356         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2357         // if this mipmap of the texture is 1 pixel, just fill it with that color
2358         if (texture->mipmap[mip][1] == 4)
2359         {
2360                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2361                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2362                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2363                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2364                 for (x = startx;x < endx;x++)
2365                 {
2366                         out4f[x*4+0] = c[0];
2367                         out4f[x*4+1] = c[1];
2368                         out4f[x*4+2] = c[2];
2369                         out4f[x*4+3] = c[3];
2370                 }
2371                 return;
2372         }
2373         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2374         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2375         flags = texture->flags;
2376         tcscale[0] = texture->mipmap[mip][2];
2377         tcscale[1] = texture->mipmap[mip][3];
2378         tciwidth = texture->mipmap[mip][2];
2379         tcimin[0] = 0;
2380         tcimin[1] = 0;
2381         tcimax[0] = texture->mipmap[mip][2]-1;
2382         tcimax[1] = texture->mipmap[mip][3]-1;
2383         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2384         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2385         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2386         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2387         for (x = startx;x < endx;)
2388         {
2389                 unsigned int subtc[2];
2390                 unsigned int substep[2];
2391                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2392                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2393                 if (nextsub >= endx)
2394                 {
2395                         nextsub = endsub = endx-1;      
2396                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2397                 }
2398                 tc[0] = endtc[0];
2399                 tc[1] = endtc[1];
2400                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2401                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2402                 substep[0] = (endtc[0] - tc[0]) * subscale;
2403                 substep[1] = (endtc[1] - tc[1]) * subscale;
2404                 subtc[0] = tc[0] * (1<<16);
2405                 subtc[1] = tc[1] * (1<<16);
2406                 if (filter)
2407                 {
2408                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2409                         {
2410                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2411                                 {
2412                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2413                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2414                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2415                                         tci[0] = subtc[0]>>16;
2416                                         tci[1] = subtc[1]>>16;
2417                                         tci1[0] = tci[0] + 1;
2418                                         tci1[1] = tci[1] + 1;
2419                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2420                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2421                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2422                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2423                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2424                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2425                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2426                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2427                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2428                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2429                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2430                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2431                                         out4f[x*4+0] = c[0];
2432                                         out4f[x*4+1] = c[1];
2433                                         out4f[x*4+2] = c[2];
2434                                         out4f[x*4+3] = c[3];
2435                                 }
2436                         }
2437                         else
2438                         {
2439                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2440                                 {
2441                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2442                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2443                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2444                                         tci[0] = subtc[0]>>16;
2445                                         tci[1] = subtc[1]>>16;
2446                                         tci1[0] = tci[0] + 1;
2447                                         tci1[1] = tci[1] + 1;
2448                                         tci[0] &= tciwrapmask[0];
2449                                         tci[1] &= tciwrapmask[1];
2450                                         tci1[0] &= tciwrapmask[0];
2451                                         tci1[1] &= tciwrapmask[1];
2452                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2453                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2454                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2455                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2456                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2457                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2458                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2459                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2460                                         out4f[x*4+0] = c[0];
2461                                         out4f[x*4+1] = c[1];
2462                                         out4f[x*4+2] = c[2];
2463                                         out4f[x*4+3] = c[3];
2464                                 }
2465                         }
2466                 }
2467                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2468                 {
2469                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2470                         {
2471                                 tci[0] = subtc[0]>>16;
2472                                 tci[1] = subtc[1]>>16;
2473                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2474                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2475                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2476                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2477                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2478                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2479                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2480                                 out4f[x*4+0] = c[0];
2481                                 out4f[x*4+1] = c[1];
2482                                 out4f[x*4+2] = c[2];
2483                                 out4f[x*4+3] = c[3];
2484                         }
2485                 }
2486                 else
2487                 {
2488                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2489                         {
2490                                 tci[0] = subtc[0]>>16;
2491                                 tci[1] = subtc[1]>>16;
2492                                 tci[0] &= tciwrapmask[0];
2493                                 tci[1] &= tciwrapmask[1];
2494                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2495                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2496                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2497                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2498                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2499                                 out4f[x*4+0] = c[0];
2500                                 out4f[x*4+1] = c[1];
2501                                 out4f[x*4+2] = c[2];
2502                                 out4f[x*4+3] = c[3];
2503                         }
2504                 }
2505         }
2506 }
2507
2508 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2509 {
2510 #ifdef SSE2_PRESENT
2511         int x;
2512         int startx = span->startx;
2513         int endx = span->endx;
2514         int flags;
2515         __m128 data, slope, tcscale;
2516         __m128i tcsize, tcmask, tcoffset, tcmax;
2517         __m128 tc, endtc;
2518         __m128i subtc, substep, endsubtc;
2519         int filter;
2520         int mip;
2521         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2522         const unsigned char * RESTRICT pixelbase;
2523         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2524         // if no texture is bound, just fill it with white
2525         if (!texture)
2526         {
2527                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2528                 return;
2529         }
2530         mip = triangle->mip[texunitindex];
2531         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2532         // if this mipmap of the texture is 1 pixel, just fill it with that color
2533         if (texture->mipmap[mip][1] == 4)
2534         {
2535                 unsigned int k = *((const unsigned int *)pixelbase);
2536                 for (x = startx;x < endx;x++)
2537                         outi[x] = k;
2538                 return;
2539         }
2540         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2541         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2542         flags = texture->flags;
2543         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2544         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2545         tcscale = _mm_cvtepi32_ps(tcsize);
2546         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2547         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2548         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2549         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2550         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2551         tcmax = _mm_packs_epi32(tcmask, tcmask);
2552         for (x = startx;x < endx;)
2553         {
2554                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2555                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2556                 if (nextsub >= endx)
2557                 {
2558                         nextsub = endsub = endx-1;
2559                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2560                 }       
2561                 tc = endtc;
2562                 subtc = endsubtc;
2563                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2564                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2565                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2566                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2567                 substep = _mm_slli_epi32(substep, 1);
2568                 if (filter)
2569                 {
2570                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2571                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2572                         {
2573                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2574                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2575                                 {
2576                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2577                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2578                                         tci = _mm_madd_epi16(tci, tcoffset);
2579                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2580                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2581                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2582                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2583                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2584                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2585                                         fracm = _mm_srli_epi16(subtc, 1);
2586                                         pix1 = _mm_add_epi16(pix1,
2587                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2588                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2589                                         pix3 = _mm_add_epi16(pix3,
2590                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2591                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2592                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2593                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2594                                         pix2 = _mm_add_epi16(pix2,
2595                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2596                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2597                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2598                                 }
2599                                 if (x <= endsub)
2600                                 {
2601                                         const unsigned char * RESTRICT ptr1;
2602                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2603                                         tci = _mm_madd_epi16(tci, tcoffset);
2604                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2605                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2606                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2607                                         fracm = _mm_srli_epi16(subtc, 1);
2608                                         pix1 = _mm_add_epi16(pix1,
2609                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2610                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2611                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2612                                         pix1 = _mm_add_epi16(pix1,
2613                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2614                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2615                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2616                                         x++;
2617                                 }
2618                         }
2619                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2620                         {
2621                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2622                                 {
2623                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2624                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2625                                         tci = _mm_madd_epi16(tci, tcoffset);
2626                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2627                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2628                                                                                         _mm_setzero_si128());
2629                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2630                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2631                                                                                         _mm_setzero_si128());
2632                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2633                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2634                                         tci = _mm_madd_epi16(tci, tcoffset);
2635                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2636                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2637                                                                                         _mm_setzero_si128());
2638                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2639                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2640                                                                                         _mm_setzero_si128());
2641                                         fracm = _mm_srli_epi16(subtc, 1);
2642                                         pix1 = _mm_add_epi16(pix1,
2643                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2644                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2645                                         pix3 = _mm_add_epi16(pix3,
2646                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2647                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2648                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2649                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2650                                         pix2 = _mm_add_epi16(pix2,
2651                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2652                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2653                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2654                                 }
2655                                 if (x <= endsub)
2656                                 {
2657                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2658                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2659                                         tci = _mm_madd_epi16(tci, tcoffset);
2660                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2661                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2662                                                                                         _mm_setzero_si128());
2663                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2664                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2665                                                                                         _mm_setzero_si128());
2666                                         fracm = _mm_srli_epi16(subtc, 1);
2667                                         pix1 = _mm_add_epi16(pix1,
2668                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2669                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2670                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2671                                         pix1 = _mm_add_epi16(pix1,
2672                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2673                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2674                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2675                                         x++;
2676                                 }
2677                         }
2678                         else
2679                         {
2680                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2681                                 {
2682                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2683                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2684                                         tci = _mm_madd_epi16(tci, tcoffset);
2685                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2686                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2687                                                                                         _mm_setzero_si128());
2688                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2689                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2690                                                                                         _mm_setzero_si128());
2691                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2692                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2693                                         tci = _mm_madd_epi16(tci, tcoffset);
2694                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2695                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2696                                                                                         _mm_setzero_si128());
2697                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2698                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2699                                                                                         _mm_setzero_si128());
2700                                         fracm = _mm_srli_epi16(subtc, 1);
2701                                         pix1 = _mm_add_epi16(pix1,
2702                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2703                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2704                                         pix3 = _mm_add_epi16(pix3,
2705                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2706                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2707                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2708                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2709                                         pix2 = _mm_add_epi16(pix2,
2710                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2711                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2712                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2713                                 }
2714                                 if (x <= endsub)
2715                                 {
2716                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2717                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2718                                         tci = _mm_madd_epi16(tci, tcoffset);
2719                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2720                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2721                                                                                         _mm_setzero_si128());
2722                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2723                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2724                                                                                         _mm_setzero_si128());
2725                                         fracm = _mm_srli_epi16(subtc, 1);
2726                                         pix1 = _mm_add_epi16(pix1,
2727                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2728                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2729                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2730                                         pix1 = _mm_add_epi16(pix1,
2731                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2732                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2733                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2734                                         x++;
2735                                 }
2736                         }
2737                 }
2738                 else
2739                 {
2740                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2741                         {
2742                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2743                                 {
2744                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2745                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2746                                         tci = _mm_madd_epi16(tci, tcoffset);
2747                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2748                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2749                                 }
2750                                 if (x <= endsub)
2751                                 {
2752                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2753                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2754                                         tci = _mm_madd_epi16(tci, tcoffset);
2755                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2756                                         x++;
2757                                 }
2758                         }
2759                         else
2760                         {
2761                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2762                                 {
2763                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2764                                         tci = _mm_and_si128(tci, tcmax); 
2765                                         tci = _mm_madd_epi16(tci, tcoffset);
2766                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2767                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2768                                 }
2769                                 if (x <= endsub)
2770                                 {
2771                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2772                                         tci = _mm_and_si128(tci, tcmax); 
2773                                         tci = _mm_madd_epi16(tci, tcoffset);
2774                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2775                                         x++;
2776                                 }
2777                         }
2778                 }
2779         }
2780 #endif
2781 }
2782
2783 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2784 {
2785         // TODO: IMPLEMENT
2786         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2787 }
2788
// Shadowmap lookup stub: the input vector is ignored and the constant 1.0f
// is returned for every call.
// NOTE(review): 1.0f presumably means "not shadowed" - confirm with callers.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float stubresult = 1.0f;
        (void)vector;
        return stubresult;
}
2794
2795 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2796 {
2797         int x;
2798         int startx = span->startx;
2799         int endx = span->endx;
2800         float c[4];
2801         float data[4];
2802         float slope[4];
2803         float z;
2804         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2805         for (x = startx;x < endx;x++)
2806         {
2807                 z = zf[x];
2808                 c[0] = (data[0] + slope[0]*x) * z;
2809                 c[1] = (data[1] + slope[1]*x) * z;
2810                 c[2] = (data[2] + slope[2]*x) * z;
2811                 c[3] = (data[3] + slope[3]*x) * z;
2812                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2813                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2814                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2815                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2816         }
2817 }
2818
2819 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2820 {
2821         int x;
2822         int startx = span->startx;
2823         int endx = span->endx;
2824         float c[4];
2825         float data[4];
2826         float slope[4];
2827         float z;
2828         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2829         for (x = startx;x < endx;x++)
2830         {
2831                 z = zf[x];
2832                 c[0] = (data[0] + slope[0]*x) * z;
2833                 c[1] = (data[1] + slope[1]*x) * z;
2834                 c[2] = (data[2] + slope[2]*x) * z;
2835                 c[3] = (data[3] + slope[3]*x) * z;
2836                 out4f[x*4+0] = c[0];
2837                 out4f[x*4+1] = c[1];
2838                 out4f[x*4+2] = c[2];
2839                 out4f[x*4+3] = c[3];
2840         }
2841 }
2842
2843 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2844 {
2845         int x, startx = span->startx, endx = span->endx;
2846         float c[4], localcolor[4];
2847         localcolor[0] = subcolor[0];
2848         localcolor[1] = subcolor[1];
2849         localcolor[2] = subcolor[2];
2850         localcolor[3] = subcolor[3];
2851         for (x = startx;x < endx;x++)
2852         {
2853                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2854                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2855                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2856                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2857                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2858                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2859                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2860                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2861         }
2862 }
2863
2864 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2865 {
2866         int x, startx = span->startx, endx = span->endx;
2867         for (x = startx;x < endx;x++)
2868         {
2869                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2870                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2871                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2872                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2873         }
2874 }
2875
2876 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2877 {
2878         int x, startx = span->startx, endx = span->endx;
2879         for (x = startx;x < endx;x++)
2880         {
2881                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2882                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2883                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2884                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2885         }
2886 }
2887
2888 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2889 {
2890         int x, startx = span->startx, endx = span->endx;
2891         float a, b;
2892         for (x = startx;x < endx;x++)
2893         {
2894                 a = 1.0f - inb4f[x*4+3];
2895                 b = inb4f[x*4+3];
2896                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2897                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2898                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2899                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2900         }
2901 }
2902
2903 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2904 {
2905         int x, startx = span->startx, endx = span->endx;
2906         float localcolor[4], ilerp, lerp;
2907         localcolor[0] = color[0];
2908         localcolor[1] = color[1];
2909         localcolor[2] = color[2];
2910         localcolor[3] = color[3];
2911         ilerp = 1.0f - localcolor[3];
2912         lerp = localcolor[3];
2913         for (x = startx;x < endx;x++)
2914         {
2915                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2916                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2917                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2918                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2919         }
2920 }
2921
2922
2923
2924 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2925 {
2926 #ifdef SSE2_PRESENT
2927         int x;
2928         int startx = span->startx;
2929         int endx = span->endx;
2930         __m128 data, slope;
2931         __m128 mod, endmod;
2932         __m128i submod, substep, endsubmod;
2933         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2934         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2935         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2936         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2937         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2938         for (x = startx; x < endx;)
2939         {
2940                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2941                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2942                 if (nextsub >= endx)
2943                 {
2944                         nextsub = endsub = endx-1;
2945                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2946                 }
2947                 mod = endmod;
2948                 submod = endsubmod;
2949                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2950                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2951                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2952                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2953                 substep = _mm_packs_epi32(substep, substep);
2954                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2955                 {
2956                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2957                         pix = _mm_mulhi_epu16(pix, submod);
2958                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2959                 }
2960                 if (x <= endsub)
2961                 {
2962                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2963                         pix = _mm_mulhi_epu16(pix, submod);
2964                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2965                         x++;
2966                 }
2967         }
2968 #endif
2969 }
2970
2971 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2972 {
2973 #ifdef SSE2_PRESENT
2974         int x;
2975         int startx = span->startx;
2976         int endx = span->endx;
2977         __m128 data, slope;
2978         __m128 mod, endmod;
2979         __m128i submod, substep, endsubmod;
2980         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2981         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2982         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2983         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2984         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2985         for (x = startx; x < endx;)
2986         {
2987                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2988                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2989                 if (nextsub >= endx)
2990                 {
2991                         nextsub = endsub = endx-1;
2992                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2993                 }
2994                 mod = endmod;
2995                 submod = endsubmod;
2996                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2997                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2998                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2999                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3000                 substep = _mm_packs_epi32(substep, substep);
3001                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3002                 {
3003                         __m128i pix = _mm_srai_epi16(submod, 4);
3004                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3005                 }
3006                 if (x <= endsub)
3007                 {
3008                         __m128i pix = _mm_srai_epi16(submod, 4);
3009                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3010                         x++;
3011                 }
3012         }
3013 #endif
3014 }
3015
3016 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3017 {
3018 #ifdef SSE2_PRESENT
3019         int x, startx = span->startx, endx = span->endx;
3020         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3021         localcolor = _mm_packs_epi32(localcolor, localcolor);
3022         for (x = startx;x+2 <= endx;x+=2)
3023         {
3024                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3025                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3026                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3027                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3028         }
3029         if (x < endx)
3030         {
3031                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3032                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3033                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3034                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3035         }
3036 #endif
3037 }
3038
3039 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3040 {
3041 #ifdef SSE2_PRESENT
3042         int x, startx = span->startx, endx = span->endx;
3043         for (x = startx;x+2 <= endx;x+=2)
3044         {
3045                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3046                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3047                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3048                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3049         }
3050         if (x < endx)
3051         {
3052                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3053                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3054                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3055                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3056         }
3057 #endif
3058 }
3059
3060 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3061 {
3062 #ifdef SSE2_PRESENT
3063         int x, startx = span->startx, endx = span->endx;
3064         for (x = startx;x+2 <= endx;x+=2)
3065         {
3066                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3067                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3068                 pix1 = _mm_add_epi16(pix1, pix2);
3069                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3070         }
3071         if (x < endx)
3072         {
3073                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3074                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3075                 pix1 = _mm_add_epi16(pix1, pix2);
3076                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3077         }
3078 #endif
3079 }
3080
3081 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3082 {
3083 #ifdef SSE2_PRESENT
3084         int x, startx = span->startx, endx = span->endx;
3085         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3086         tint = _mm_packs_epi32(tint, tint);
3087         for (x = startx;x+2 <= endx;x+=2)
3088         {
3089                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3090                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3091                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3092                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3093         }
3094         if (x < endx)
3095         {
3096                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3097                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3098                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3099                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3100         }
3101 #endif
3102 }
3103
3104 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3105 {
3106 #ifdef SSE2_PRESENT
3107         int x, startx = span->startx, endx = span->endx;
3108         for (x = startx;x+2 <= endx;x+=2)
3109         {
3110                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3111                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3112                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3113                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3114                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3115         }
3116         if (x < endx)
3117         {
3118                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3119                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3120                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3121                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3122                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3123         }
3124 #endif
3125 }
3126
3127 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3128 {
3129 #ifdef SSE2_PRESENT
3130         int x, startx = span->startx, endx = span->endx;
3131         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3132         localcolor = _mm_packs_epi32(localcolor, localcolor);
3133         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3134         for (x = startx;x+2 <= endx;x+=2)
3135         {
3136                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3137                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3138                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3139         }
3140         if (x < endx)
3141         {
3142                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3143                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3144                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3145         }
3146 #endif
3147 }
3148
3149
3150
3151 void DPSOFTRAST_VertexShader_Generic(void)
3152 {
3153         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3154         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3155         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3156         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3157                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3158 }
3159
3160 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3161 {
3162         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3163         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3164         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3165         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3166         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3167         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3168         {
3169                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3170                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3171                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3172                 {
3173                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3174                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3175                         {
3176                                 // multiply
3177                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3178                         }
3179                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3180                         {
3181                                 // add
3182                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3183                         }
3184                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3185                         {
3186                                 // alphablend
3187                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3188                         }
3189                 }
3190         }
3191         else
3192                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3193         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3194 }
3195
3196
3197
3198 void DPSOFTRAST_VertexShader_PostProcess(void)
3199 {
3200         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3201         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3202         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3203 }
3204
3205 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3206 {
3207         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3208         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3209         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3210         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3211         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3212         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3213         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3214         {
3215                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3216                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3217         }
3218         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3219         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3220         {
3221                 // TODO: implement saturation
3222         }
3223         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3224         {
3225                 // TODO: implement gammaramps
3226         }
3227         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3228 }
3229
3230
3231
3232 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3233 {
3234         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3235 }
3236
3237 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3238 {
3239         // this is never called (because colormask is off when this shader is used)
3240         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3241         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3242         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3243         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3244         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3245 }
3246
3247
3248
3249 void DPSOFTRAST_VertexShader_FlatColor(void)
3250 {
3251         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3252         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3253 }
3254
3255 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3256 {
3257 #ifdef SSE2_PRESENT
3258         unsigned char * RESTRICT pixelmask = span->pixelmask;
3259         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3260         int x, startx = span->startx, endx = span->endx;
3261         __m128i Color_Ambientm;
3262         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3263         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3264         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3265         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3266         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3267         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3268                 pixel = buffer_FragColorbgra8;
3269         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3270         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3271         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3272         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3273         for (x = startx;x < endx;x++)
3274         {
3275                 __m128i color, pix;
3276                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3277                 {
3278                         __m128i pix2;
3279                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3280                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3281                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3282                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3283                         x += 3;
3284                         continue;
3285                 }
3286                 if (!pixelmask[x])
3287                         continue;
3288                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3289                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3290                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3291         }
3292         if (pixel == buffer_FragColorbgra8)
3293                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3294 #endif
3295 }
3296
3297
3298
3299 void DPSOFTRAST_VertexShader_VertexColor(void)
3300 {
3301         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3302         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3303         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3304 }
3305
3306 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3307 {
3308 #ifdef SSE2_PRESENT
3309         unsigned char * RESTRICT pixelmask = span->pixelmask;
3310         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3311         int x, startx = span->startx, endx = span->endx;
3312         __m128i Color_Ambientm, Color_Diffusem;
3313         __m128 data, slope;
3314         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3315         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3316         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3317         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3318         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3319         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3320         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3321                 pixel = buffer_FragColorbgra8;
3322         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3323         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3324         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3325         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3326         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3327         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3328         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3329         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3330         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3331         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3332         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3333         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3334         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3335         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3336         {
3337                 __m128i color, mod, pix;
3338                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3339                 {
3340                         __m128i pix2, mod2;
3341                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3342                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3343                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3344                         data = _mm_add_ps(data, slope);
3345                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3346                         data = _mm_add_ps(data, slope);
3347                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3348                         data = _mm_add_ps(data, slope);
3349                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3350                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3351                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3352                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3353                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3354                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3355                         x += 3;
3356                         continue;
3357                 }
3358                 if (!pixelmask[x])
3359                         continue;
3360                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3361                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3362                 mod = _mm_packs_epi32(mod, mod);
3363                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3364                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3365         }
3366         if (pixel == buffer_FragColorbgra8)
3367                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3368 #endif
3369 }
3370
3371
3372
3373 void DPSOFTRAST_VertexShader_Lightmap(void)
3374 {
3375         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3376         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3377         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3378 }
3379
3380 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3381 {
3382 #ifdef SSE2_PRESENT
3383         unsigned char * RESTRICT pixelmask = span->pixelmask;
3384         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3385         int x, startx = span->startx, endx = span->endx;
3386         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3387         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3388         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3389         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3390         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3391         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3392         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3393         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3394         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3395         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3396                 pixel = buffer_FragColorbgra8;
3397         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3398         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3399         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3400         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3401         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3402         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3403         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3404         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3405         {
3406                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3407                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3408                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3409                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3410                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3411                 for (x = startx;x < endx;x++)
3412                 {
3413                         __m128i color, lightmap, glow, pix;
3414                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3415                         {
3416                                 __m128i pix2;
3417                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3418                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3419                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3420                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3421                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3422                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3423                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3424                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3425                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3426                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3427                                 x += 3;
3428                                 continue;
3429                         }
3430                         if (!pixelmask[x])
3431                                 continue;
3432                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3433                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3434                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3435                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3436                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3437                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3438                 }
3439         }
3440         else
3441         {
3442                 for (x = startx;x < endx;x++)
3443                 {
3444                         __m128i color, lightmap, pix;
3445                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3446                         {
3447                                 __m128i pix2;
3448                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3449                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3450                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3451                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3452                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3453                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3454                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3455                                 x += 3;
3456                                 continue;
3457                         }
3458                         if (!pixelmask[x]) 
3459                                 continue;
3460                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3461                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3462                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3463                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3464                 }
3465         }
3466         if (pixel == buffer_FragColorbgra8)
3467                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3468 #endif
3469 }
3470
3471
// Forward declarations: the FakeLight and LightDirectionMap shader entry points
// below call these two functions before their definitions appear later in this file.
3472 void DPSOFTRAST_VertexShader_LightDirection(void);
3473 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3474
// Vertex stage entry point for FakeLight: does no work of its own and simply
// runs DPSOFTRAST_VertexShader_LightDirection.
3475 void DPSOFTRAST_VertexShader_FakeLight(void)
3476 {
3477         DPSOFTRAST_VertexShader_LightDirection();
3478 }
3479
// Pixel stage entry point for FakeLight: forwards thread, triangle and span
// unchanged to DPSOFTRAST_PixelShader_LightDirection, which selects its
// FakeLight code path by comparing thread->shader_mode to SHADERMODE_FAKELIGHT.
3480 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3481 {
3482         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3483 }
3484
3485
3486
// Vertex stage entry point for the model-space light direction map: runs the
// common LightDirection vertex shader, then additionally loads TEXCOORD4.
3487 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3488 {
3489         DPSOFTRAST_VertexShader_LightDirection();
             // TEXCOORD4 is the coordinate the LightDirection pixel shader uses to
             // sample GL20TU_LIGHTMAP and GL20TU_DELUXEMAP.
             // NOTE(review): presumably Array_Load copies the array through untransformed - confirm.
3490         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3491 }
3492
// Pixel stage entry point for the model-space light direction map: forwards
// thread, triangle and span unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which selects the model-space path by comparing thread->shader_mode to
// SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE.
3493 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3494 {
3495         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3496 }
3497
3498
3499
// Vertex stage entry point for the tangent-space light direction map: runs the
// common LightDirection vertex shader, then additionally loads TEXCOORD4.
3500 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3501 {
3502         DPSOFTRAST_VertexShader_LightDirection();
             // TEXCOORD4 is the coordinate the LightDirection pixel shader uses to
             // sample GL20TU_LIGHTMAP and GL20TU_DELUXEMAP.
             // NOTE(review): presumably Array_Load copies the array through untransformed - confirm.
3503         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3504 }
3505
// Pixel stage entry point for the tangent-space light direction map: forwards
// thread, triangle and span unchanged to DPSOFTRAST_PixelShader_LightDirection,
// which selects the tangent-space path by comparing thread->shader_mode to
// SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3506 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3507 {
3508         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3509 }
3510
3511
3512
// Common vertex stage for the LightDirection family of shaders.
//
// Takes no arguments and returns nothing; all inputs and outputs live in the
// global dpsoftrast state. For each of dpsoftrast.numvertices vertices it:
//   - writes to TEXCOORD5 the LightDir uniform dotted with the vertex's
//     TEXCOORD1 / TEXCOORD2 / TEXCOORD3 vectors (svector, tvector, normal;
//     presumably the tangent-space basis), with w = 0;
//   - writes to TEXCOORD6 the vector (EyePosition - position) dotted with the
//     same three vectors, with w = 0.
// Before the loop it loads POSITION and TEXCOORD1..3 and transforms TEXCOORD0 by
// the TexMatrixM1 uniform; after the loop it projects POSITION with the
// ModelViewProjectionMatrixM1 uniform.
3513 void DPSOFTRAST_VertexShader_LightDirection(void)
3514 {
3515         int i;
3516         int numvertices = dpsoftrast.numvertices;
3517         float LightDir[4];
3518         float LightVector[4];
3519         float EyePosition[4];
3520         float EyeVectorModelSpace[4];
3521         float EyeVector[4];
3522         float position[4];
3523         float svector[4];
3524         float tvector[4];
3525         float normal[4];
             // Copy the two uniforms into locals once. Element [3] of each is copied
             // too, but only elements [0..2] are read by the code below.
3526         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3527         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3528         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3529         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3530         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3531         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3532         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3533         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
             // Prepare the arrays the loop reads from dpsoftrast.post_array4f.
             // NOTE(review): presumably Array_Load fills post_array4f with the
             // untransformed input array - confirm against its definition.
3534         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3535         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3536         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3537         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3538         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3539         for (i = 0;i < numvertices;i++)
3540         {
                     // Fetch xyz of the position and of the three per-vertex vectors
                     // (each vertex occupies 4 consecutive floats; w is not read).
3541                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3542                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3543                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3544                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3545                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3546                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3547                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3548                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3549                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3550                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3551                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3552                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
                     // LightVector = (svector . LightDir, tvector . LightDir, normal . LightDir)
3553                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3554                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3555                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
                     // Output for the pixel stage, which interpolates TEXCOORD5 as its light vector.
3556                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3557                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3558                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3559                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
                     // Vector from the vertex to the eye, then dotted with the same three vectors.
3560                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3561                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3562                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3563                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3564                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3565                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
                     // Output for the pixel stage, which interpolates TEXCOORD6 as its eye vector.
3566                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3567                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3568                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3569                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3570         }
             // Projection happens last, after the loop has read the loaded positions.
             // NOTE(review): presumably TransformProject rewrites the POSITION post-array
             // in place, which would make this ordering mandatory - confirm.
3571         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3572 }
3573
// Small math helper macros.
// DPSOFTRAST_Min / DPSOFTRAST_Max yield the smaller / larger argument. Each
// argument appears twice in the expansion, so do not pass expressions with
// side effects.
3574 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3575 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// 3-component dot product, squared length and length of indexable operands
// (elements [0], [1] and [2] are read). Length goes through sqrt(), so its
// result has type double.
3576 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3577 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3578 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Scales v[0..2] in place to unit length; a vector whose length is zero is
// left untouched. The expansion uses v unparenthesized in the scaling
// statements and declares a local named len, so pass a plain array or pointer
// name (not an expression, and not a variable called len). Expands to a single
// do { ... } while(0) statement; the caller supplies the trailing semicolon.
3579 #define DPSOFTRAST_Vector3Normalize(v)\
3580 do\
3581 {\
3582         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3583         if (len)\
3584         {\
3585                 len = 1.0f / len;\
3586                 v[0] *= len;\
3587                 v[1] *= len;\
3588                 v[2] *= len;\
3589         }\
3590 }\
3591 while(0)
3592
3593 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3594 {
3595         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3596         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3597         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3598         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3599         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3600         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3601         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3602         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3603         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3604         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3605         int x, startx = span->startx, endx = span->endx;
3606         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3607         float LightVectordata[4];
3608         float LightVectorslope[4];
3609         float EyeVectordata[4];
3610         float EyeVectorslope[4];
3611         float VectorSdata[4];
3612         float VectorSslope[4];
3613         float VectorTdata[4];
3614         float VectorTslope[4];
3615         float VectorRdata[4];
3616         float VectorRslope[4];
3617         float z;
3618         float diffusetex[4];
3619         float glosstex[4];
3620         float surfacenormal[4];
3621         float lightnormal[4];
3622         float lightnormal_modelspace[4];
3623         float eyenormal[4];
3624         float specularnormal[4];
3625         float diffuse;
3626         float specular;
3627         float SpecularPower;
3628         int d[4];
3629         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3630         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3631         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3632         Color_Glow[3] = 0.0f;
3633         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3634         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3635         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3636         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3637         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3638         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3639         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3640         Color_Pants[3] = 0.0f;
3641         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3642         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3643         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3644         Color_Shirt[3] = 0.0f;
3645         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3646         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3647         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3648         {
3649                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3650                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3651         }
3652         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3653         {
3654                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3655         }
3656         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3657         {
3658                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3659                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3660                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3661                 Color_Diffuse[3] = 0.0f;
3662                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3663                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3664                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3665                 LightColor[3] = 0.0f;
3666                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3667                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3668                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3669                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3670                 Color_Specular[3] = 0.0f;
3671                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3672                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3673                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3674
3675                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3676                 {
3677                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3678                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3679                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3680                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3681                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3682                 }
3683                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3684                 {
3685                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3686                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3687                 }
3688                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3689                 {
3690                         // nothing of this needed
3691                 }
3692                 else
3693                 {
3694                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3695                 }
3696
3697                 for (x = startx;x < endx;x++)
3698                 {
3699                         z = buffer_z[x];
3700                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3701                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3702                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3703                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3704                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3705                         {
3706                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3707                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3708                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3709                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3710                         }
3711                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3712                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3713                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3714                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3715                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3716                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3717                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3718                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3719
3720                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3721                         {
3722                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3723                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3724                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3725                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3726
3727                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3728                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3729                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3730                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3731
3732                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3733                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3734                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3735                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3736
3737                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3738                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3739                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3740                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3741
3742                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3743                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3744
3745                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3746                                 {
3747                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3748                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3749                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3750                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3751                                 }
3752                         }
3753                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3754                         {
3755                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3756                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3757                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3758                                 {
3759                                         float f = 1.0f / 256.0f;
3760                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3761                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3762                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3763                                 }
3764                         }
3765                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3766                         {
3767                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3768                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3769                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3770                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3771
3772                                 LightColor[0] = 1.0;
3773                                 LightColor[1] = 1.0;
3774                                 LightColor[2] = 1.0;
3775                         }
3776                         else
3777                         {
3778                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3779                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3780                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3781                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3782                         }
3783
3784                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3785
3786                         if(thread->shader_exactspecularmath)
3787                         {
3788                                 // reflect lightnormal at surfacenormal, take the negative of that
3789                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3790                                 float f;
3791                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3792                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3793                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3794                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3795
3796                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3797                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3798                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3799                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3800                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3801
3802                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3803                         }
3804                         else
3805                         {
3806                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3807                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3808                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3809                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3810
3811                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3812                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3813                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3814                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3815
3816                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3817                         }
3818
3819                         specular = pow(specular, SpecularPower * glosstex[3]);
3820                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3821                         {
3822                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3823                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3824                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3825                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3826                         }
3827                         else
3828                         {
3829                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3830                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3831                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3832                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3833                         }
3834
3835                         buffer_FragColorbgra8[x*4+0] = d[0];
3836                         buffer_FragColorbgra8[x*4+1] = d[1];
3837                         buffer_FragColorbgra8[x*4+2] = d[2];
3838                         buffer_FragColorbgra8[x*4+3] = d[3];
3839                 }
3840         }
3841         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3842         {
3843                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3844                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3845                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3846                 Color_Diffuse[3] = 0.0f;
3847                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3848                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3849                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3850                 LightColor[3] = 0.0f;
3851                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3852
3853                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3854                 {
3855                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3856                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3857                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3858                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3859                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3860                 }
3861                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3862                 {
3863                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3864                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3865                 }
3866                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3867                 {
3868                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3869                 }
3870                 else
3871                 {
3872                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3873                 }
3874
3875                 for (x = startx;x < endx;x++)
3876                 {
3877                         z = buffer_z[x];
3878                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3879                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3880                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3881                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3882                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3883                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3884                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3885                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3886
3887                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3888                         {
3889                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3890                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3891                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3892                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3893
3894                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3895                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3896                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3897                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3898
3899                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3900                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3901                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3902                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3903
3904                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3905                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3906                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3907                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3908
3909                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3910                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3911
3912                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3913                                 {
3914                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3915                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3916                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3917                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3918                                 }
3919                         }
3920                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3921                         {
3922                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3923                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3924                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3925                                 {
3926                                         float f = 1.0f / 256.0f;
3927                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3928                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3929                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3930                                 }
3931                         }
3932                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3933                         {
3934                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3935                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3936                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3937                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3938
3939                                 LightColor[0] = 1.0;
3940                                 LightColor[1] = 1.0;
3941                                 LightColor[2] = 1.0;
3942                         }
3943                         else
3944                         {
3945                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3946                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3947                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3948                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3949                         }
3950
3951                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3952                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3953                         {
3954                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3955                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3956                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3957                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3958                         }
3959                         else
3960                         {
3961                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3962                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3963                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3964                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3965                         }
3966                         buffer_FragColorbgra8[x*4+0] = d[0];
3967                         buffer_FragColorbgra8[x*4+1] = d[1];
3968                         buffer_FragColorbgra8[x*4+2] = d[2];
3969                         buffer_FragColorbgra8[x*4+3] = d[3];
3970                 }
3971         }
3972         else
3973         {
3974                 for (x = startx;x < endx;x++)
3975                 {
3976                         z = buffer_z[x];
3977                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3978                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3979                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3980                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3981
3982                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3983                         {
3984                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3985                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3986                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3987                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3988                         }
3989                         else
3990                         {
3991                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3992                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3993                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3994                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3995                         }
3996                         buffer_FragColorbgra8[x*4+0] = d[0];
3997                         buffer_FragColorbgra8[x*4+1] = d[1];
3998                         buffer_FragColorbgra8[x*4+2] = d[2];
3999                         buffer_FragColorbgra8[x*4+3] = d[3];
4000                 }
4001         }
4002         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4003 }
4004
4005
4006
4007 void DPSOFTRAST_VertexShader_LightSource(void)
4008 {
4009         int i;
4010         int numvertices = dpsoftrast.numvertices;
4011         float LightPosition[4];
4012         float LightVector[4];
4013         float LightVectorModelSpace[4];
4014         float EyePosition[4];
4015         float EyeVectorModelSpace[4];
4016         float EyeVector[4];
4017         float position[4];
4018         float svector[4];
4019         float tvector[4];
4020         float normal[4];
4021         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4022         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4023         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4024         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4025         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4026         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4027         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4028         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4029         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4030         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4031         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4032         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4033         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4034         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4035         for (i = 0;i < numvertices;i++)
4036         {
4037                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4038                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4039                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4040                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4041                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4042                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4043                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4044                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4045                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4046                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4047                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4048                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4049                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4050                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4051                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4052                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4053                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4054                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4055                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4056                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4057                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4058                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4059                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4060                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4061                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4062                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4063                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4064                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4065                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4066                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4067                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4068                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4069         }
4070         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4071         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4072 }
4073
4074 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4075 {
4076 #ifdef SSE2_PRESENT
4077         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4078         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4079         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4080         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4081         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4082         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4083         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4084         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4085         int x, startx = span->startx, endx = span->endx;
4086         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4087         float CubeVectordata[4];
4088         float CubeVectorslope[4];
4089         float LightVectordata[4];
4090         float LightVectorslope[4];
4091         float EyeVectordata[4];
4092         float EyeVectorslope[4];
4093         float z;
4094         float diffusetex[4];
4095         float glosstex[4];
4096         float surfacenormal[4];
4097         float lightnormal[4];
4098         float eyenormal[4];
4099         float specularnormal[4];
4100         float diffuse;
4101         float specular;
4102         float SpecularPower;
4103         float CubeVector[4];
4104         float attenuation;
4105         int d[4];
4106         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4107         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4108         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4109         Color_Glow[3] = 0.0f;
4110         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4111         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4112         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4113         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4114         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4115         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4116         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4117         Color_Diffuse[3] = 0.0f;
4118         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4119         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4120         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4121         Color_Specular[3] = 0.0f;
4122         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4123         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4124         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4125         Color_Pants[3] = 0.0f;
4126         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4127         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4128         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4129         Color_Shirt[3] = 0.0f;
4130         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4131         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4132         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4133         LightColor[3] = 0.0f;
4134         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4135         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4136         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4137         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4138         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4139         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4140         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4141         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4142         {
4143                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4144                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4145         }
4146         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4147                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4148         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4149         {
4150                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4151                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4152                 for (x = startx;x < endx;x++)
4153                 {
4154                         z = buffer_z[x];
4155                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4156                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4157                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4158                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4159                         if (attenuation < 0.01f)
4160                                 continue;
4161                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4162                         {
4163                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4164                                 if (attenuation < 0.01f)
4165                                         continue;
4166                         }
4167
4168                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4169                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4170                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4171                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4172                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4173                         {
4174                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4175                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4176                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4177                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4178                         }
4179                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4180                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4181                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4182                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4183                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4184                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4185                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4186                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4187
4188                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4189                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4190                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4191                         DPSOFTRAST_Vector3Normalize(lightnormal);
4192
4193                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4194
4195                         if(thread->shader_exactspecularmath)
4196                         {
4197                                 // reflect lightnormal at surfacenormal, take the negative of that
4198                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4199                                 float f;
4200                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4201                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4202                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4203                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4204
4205                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4206                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4207                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4208                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4209                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4210
4211                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4212                         }
4213                         else
4214                         {
4215                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4216                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4217                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4218                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4219
4220                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4221                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4222                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4223                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4224
4225                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4226                         }
4227                         specular = pow(specular, SpecularPower * glosstex[3]);
4228
4229                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4230                         {
4231                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4232                                 attenuation *= (1.0f / 255.0f);
4233                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4234                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4235                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4236                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4237                         }
4238                         else
4239                         {
4240                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4241                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4242                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4243                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4244                         }
4245                         buffer_FragColorbgra8[x*4+0] = d[0];
4246                         buffer_FragColorbgra8[x*4+1] = d[1];
4247                         buffer_FragColorbgra8[x*4+2] = d[2];
4248                         buffer_FragColorbgra8[x*4+3] = d[3];
4249                 }
4250         }
4251         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4252         {
4253                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4254                 for (x = startx;x < endx;x++)
4255                 {
4256                         z = buffer_z[x];
4257                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4258                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4259                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4260                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4261                         if (attenuation < 0.01f)
4262                                 continue;
4263                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4264                         {
4265                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4266                                 if (attenuation < 0.01f)
4267                                         continue;
4268                         }
4269
4270                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4271                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4272                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4273                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4274                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4275                         {
4276                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4277                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4278                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4279                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4280                         }
4281                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4282                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4283                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4284                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4285
4286                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4287                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4288                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4289                         DPSOFTRAST_Vector3Normalize(lightnormal);
4290
4291                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4292                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4293                         {
4294                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4295                                 attenuation *= (1.0f / 255.0f);
4296                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4297                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4298                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4299                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4300                         }
4301                         else
4302                         {
4303                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4304                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4305                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4306                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4307                         }
4308                         buffer_FragColorbgra8[x*4+0] = d[0];
4309                         buffer_FragColorbgra8[x*4+1] = d[1];
4310                         buffer_FragColorbgra8[x*4+2] = d[2];
4311                         buffer_FragColorbgra8[x*4+3] = d[3];
4312                 }
4313         }
4314         else
4315         {
4316                 for (x = startx;x < endx;x++)
4317                 {
4318                         z = buffer_z[x];
4319                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4320                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4321                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4322                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4323                         if (attenuation < 0.01f)
4324                                 continue;
4325                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4326                         {
4327                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4328                                 if (attenuation < 0.01f)
4329                                         continue;
4330                         }
4331
4332                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4333                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4334                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4335                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4336                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4337                         {
4338                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4339                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4340                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4341                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4342                         }
4343                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4344                         {
4345                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4346                                 attenuation *= (1.0f / 255.0f);
4347                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4348                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4349                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4350                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4351                         }
4352                         else
4353                         {
4354                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4355                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4356                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4357                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4358                         }
4359                         buffer_FragColorbgra8[x*4+0] = d[0];
4360                         buffer_FragColorbgra8[x*4+1] = d[1];
4361                         buffer_FragColorbgra8[x*4+2] = d[2];
4362                         buffer_FragColorbgra8[x*4+3] = d[3];
4363                 }
4364         }
4365         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4366 #endif
4367 }
4368
4369
4370
4371 void DPSOFTRAST_VertexShader_Refraction(void)
4372 {
4373         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4374         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4375         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4376 }
4377
4378 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4379 {
4380         // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4381
4382         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4383         float z;
4384         int x, startx = span->startx, endx = span->endx;
4385
4386         // texture reads
4387         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4388         //unsigned char buffer_texture_refractionbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4389         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4390
4391         // varyings
4392         float ModelViewProjectionPositiondata[4];
4393         float ModelViewProjectionPositionslope[4];
4394
4395         // uniforms
4396         float ScreenScaleRefractReflect[2];
4397         float ScreenCenterRefractReflect[2];
4398         float DistortScaleRefractReflect[2];
4399         float RefractColor[4];
4400
4401         const unsigned char * RESTRICT pixelbase;
4402         const unsigned char * RESTRICT pixel[4];
4403         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4404         if(!texture) return;
4405         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4406
4407         // read textures
4408         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4409         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4410         //DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_refractionbgra8, GL20TU_REFRACTION, DPSOFTRAST_ARRAY_TEXCOORD1, buffer_z);
4411
4412         // read varyings
4413         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4414
4415         // read uniforms
4416         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4417         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4418         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4419         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4420         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4421         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4422         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4423         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4424         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4425         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4426
4427         // do stuff
4428         for (x = startx;x < endx;x++)
4429         {
4430                 float SafeScreenTexCoord[2];
4431                 float ScreenTexCoord[2];
4432                 float v[3];
4433                 float iw;
4434                 unsigned char c[4];
4435
4436                 z = buffer_z[x];
4437
4438                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4439                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4440         
4441                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4442                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4443                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4444
4445                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4446                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4447                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4448                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4449                 DPSOFTRAST_Vector3Normalize(v);
4450                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4451                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4452
4453                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4454                 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4455                 {
4456                         unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<16) - 32768, ScreenTexCoord[1] * (texture->mipmap[0][3]<<16) - 32678};
4457                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4458                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4459                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4460                         int tci[2] = { tc[0]>>16, tc[1]>>16 };
4461                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4462                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4463                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4464                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4465                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4466                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4467                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4468                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4469                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4470                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4471                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4472                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4473                 }
4474                 else
4475                 {
4476                         int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2] - 0.5, ScreenTexCoord[1] * texture->mipmap[0][3] - 0.5 };
4477                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4478                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4479                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4480                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4481                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4482                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4483                         c[0] = pixel[0][0];
4484                         c[1] = pixel[0][1];
4485                         c[2] = pixel[0][2];
4486                 }
4487
4488                 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4489                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4490                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4491                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4492                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4493         }
4494
4495         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4496 }
4497
4498
4499
4500 void DPSOFTRAST_VertexShader_Water(void)
4501 {
4502         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4503 }
4504
4505
4506 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4507 {
4508         // TODO: IMPLEMENT
4509         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4510         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4511         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4512         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4513         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4514 }
4515
4516
4517
4518 void DPSOFTRAST_VertexShader_ShowDepth(void)
4519 {
4520         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4521 }
4522
4523 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4524 {
4525         // TODO: IMPLEMENT
4526         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4527         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4528         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4529         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4530         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4531 }
4532
4533
4534
4535 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4536 {
4537         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4538 }
4539
4540 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4541 {
4542         // TODO: IMPLEMENT
4543         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4544         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4545         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4546         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4547         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4548 }
4549
4550
4551
4552 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4553 {
4554         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4555 }
4556
4557 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4558 {
4559         // TODO: IMPLEMENT
4560         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4561         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4562         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4563         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4564         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4565 }
4566
4567
4568
4569 typedef struct DPSOFTRAST_ShaderModeInfo_s
4570 {
4571         int lodarrayindex;
4572         void (*Vertex)(void);
4573         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4574         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4575         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4576 }
4577 DPSOFTRAST_ShaderModeInfo;
4578
4579 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4580 {
4581         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4582         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4583         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4584         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4585         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4586         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4587         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4588         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4589         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4590         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4591         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4592         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4593         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4594         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4595         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4596         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4597 };
4598
4599 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4600 {
4601         int i;
4602         int x;
4603         int startx;
4604         int endx;
4605 //      unsigned int c;
4606 //      unsigned int *colorpixel;
4607         unsigned int *depthpixel;
4608         float w;
4609         float wslope;
4610         int depth;
4611         int depthslope;
4612         unsigned int d;
4613         DPSOFTRAST_State_Triangle *triangle;
4614         DPSOFTRAST_State_Span *span;
4615         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4616         for (i = 0; i < thread->numspans; i++)
4617         {
4618                 span = &thread->spans[i];
4619                 triangle = &thread->triangles[span->triangle];
4620                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4621                 {
4622                         wslope = triangle->w[0];
4623                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4624                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4625                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4626                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4627                         startx = span->startx;
4628                         endx = span->endx;
4629                         switch(thread->fb_depthfunc)
4630                         {
4631                         default:
4632                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4633                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4634                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4635                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4636                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4637                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4638                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4639                         }
4640                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4641                         //for (x = startx;x < endx;x++)
4642                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4643                         // if there is no color buffer, skip pixel shader
4644                         while (startx < endx && !pixelmask[startx])
4645                                 startx++;
4646                         while (endx > startx && !pixelmask[endx-1])
4647                                 endx--;
4648                         if (startx >= endx)
4649                                 continue; // no pixels to fill
4650                         span->pixelmask = pixelmask;
4651                         span->startx = startx;
4652                         span->endx = endx;
4653                         // run pixel shader if appropriate
4654                         // do this before running depthmask code, to allow the pixelshader
4655                         // to clear pixelmask values for alpha testing
4656                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4657                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4658                         if (thread->depthmask)
4659                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4660                                         if (pixelmask[x])
4661                                                 depthpixel[x] = d;
4662                 }
4663                 else
4664                 {
4665                         // no depth testing means we're just dealing with color...
4666                         // if there is no color buffer, skip pixel shader
4667                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4668                         {
4669                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4670                                 span->pixelmask = pixelmask;
4671                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4672                         }
4673                 }
4674         }
4675         thread->numspans = 0;
4676 }
4677
4678 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4679
// Executes one queued Draw command on a single rasterizer thread.
// For every triangle it fetches the three vertex indices, optionally clips the
// triangle against the near plane (when command->clipped is set), rejects it by
// facing according to thread->cullface, rejects it if its bounding box misses the
// scissor rectangle / this thread's Y bands, computes the screen-space gradient
// planes for W and for every array the active shader mode uses, picks a mip
// level per texture unit, and finally emits spans (at most
// DPSOFTRAST_DRAW_MAXSPANLENGTH pixels each) for the scanlines inside the
// thread's two bands (miny1..maxy1 and miny2..maxy2).
// Buffered spans/triangles are handed to DPSOFTRAST_Draw_ProcessSpans whenever
// the span or triangle buffer fills up, and once more at the end.
// Every thread decrements command->refcount exactly once; the thread that
// brings it to zero frees command->arrays when commandsize is no larger than
// the bare command struct (the case in which
// DPSOFTRAST_Draw_AllocateDrawCommand obtained the data from MM_CALLOC).
// The whole body is compiled only when SSE2_PRESENT is defined.
4680 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4681 {
4682 #ifdef SSE2_PRESENT
4683         int cullface = thread->cullface;
4684         int minx, maxx, miny, maxy;
4685         int miny1, maxy1, miny2, maxy2;
4686         __m128i fbmin, fbmax;
4687         __m128 viewportcenter, viewportscale;
4688         int firstvertex = command->firstvertex;
4689         int numvertices = command->numvertices;
4690         int numtriangles = command->numtriangles;
4691         const int *element3i = command->element3i;
4692         const unsigned short *element3s = command->element3s;
4693         int clipped = command->clipped;
4694         int i;
4695         int j;
4696         int k;
4697         int y;
4698         int e[3];
4699         __m128i screeny;
4700         int starty, endy, bandy;
4701         int numpoints;
4702         int clipcase;
4703         float clipdist[4];
4704         __m128 triangleedge1, triangleedge2, trianglenormal;
4705         __m128 clipfrac[3];
4706         __m128 screen[4];
4707         DPSOFTRAST_State_Triangle *triangle;
4708         DPSOFTRAST_Texture *texture;
4709         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // vertical scissor limits, and this thread's two Y bands clamped to them
4710         miny = thread->fb_scissor[1];
4711         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4712         miny1 = bound(miny, thread->miny1, maxy);
4713         maxy1 = bound(miny, thread->maxy1, maxy);
4714         miny2 = bound(miny, thread->miny2, maxy);
4715         maxy2 = bound(miny, thread->maxy2, maxy);
             // the draw's Y range overlaps neither band: nothing to rasterize on this
             // thread, so only release our reference to the command
4716         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4717         {
4718                 if (!ATOMIC_DECREMENT(command->refcount))
4719                 {
4720                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4721                                 MM_FREE(command->arrays);
4722                 }
4723                 return;
4724         }
4725         minx = thread->fb_scissor[0];
4726         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
             // packed 16-bit (x, y) pairs used to clamp triangle bounding boxes:
             // fbmin is the top-left of band 1, fbmax the inclusive bottom-right of band 2
4727         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4728         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4729         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4730         viewportscale = _mm_load_ps(thread->fb_viewportscale);
             // screen[3] and clipfrac[] are only written by some clip cases; give them defined values
4731         screen[3] = _mm_setzero_ps();
4732         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4733         for (i = 0;i < numtriangles;i++)
4734         {
                     // command->arrays starts with the screen coordinates (4 floats per vertex);
                     // the array that follows is the one whose z and w are used for the near-plane test below
4735                 const float *screencoord4f = command->arrays;
4736                 const float *arrays = screencoord4f + numvertices*4;
4737
4738                 // generate the 3 edges of this triangle
4739                 // generate spans for the triangle - switch based on left split or right split classification of triangle
                     // fetch the three vertex indices, relative to firstvertex when an element array is present
4740                 if (element3s)
4741                 {
4742                         e[0] = element3s[i*3+0] - firstvertex;
4743                         e[1] = element3s[i*3+1] - firstvertex;
4744                         e[2] = element3s[i*3+2] - firstvertex;
4745                 }
4746                 else if (element3i)
4747                 {
4748                         e[0] = element3i[i*3+0] - firstvertex;
4749                         e[1] = element3i[i*3+1] - firstvertex;
4750                         e[2] = element3i[i*3+2] - firstvertex;
4751                 }
4752                 else
4753                 {
                             // no element array: triangles use consecutive vertices
4754                         e[0] = i*3+0;
4755                         e[1] = i*3+1;
4756                         e[2] = i*3+2;
4757                 }
4758
                     // SKIPBACKFACE: builds the two screen-space edge vectors leaving screen[1],
                     // computes the scalar e1.x*e2.y - e1.y*e2.x in the low lane of trianglenormal,
                     // and skips the triangle (continue) when its sign matches the culled facing
4759 #define SKIPBACKFACE \
4760                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4761                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4762                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4763                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4764                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4765                 switch(cullface) \
4766                 { \
4767                 case GL_BACK: \
4768                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4769                                 continue; \
4770                         break; \
4771                 case GL_FRONT: \
4772                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4773                                 continue; \
4774                         break; \
4775                 }
4776
                     // CLIPPEDVERTEXLERP: stores in clipfrac[p1] the fraction along p1->p2 at which
                     // clipdist reaches zero, interpolates the two vertices by it and passes the
                     // result through DPSOFTRAST_PROJECTVERTEX into screen[k]
                     // CLIPPEDVERTEXCOPY: loads the already computed screen coordinate of vertex p1 into screen[k]
4777 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4778                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4779                         { \
4780                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4781                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4782                         }
4783 #define CLIPPEDVERTEXCOPY(k,p1) \
4784                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4785
                     // GENATTRIBCOPY / GENATTRIBLERP: same copy / interpolate choice for an attribute
                     // array, reusing the clipfrac values computed while clipping the positions
                     // GENATTRIBS: produces the attribute values of the first three polygon points
                     // for the clip case selected below (cases 0..6 mirror the CLIPPEDVERTEX* sequences)
4786 #define GENATTRIBCOPY(attrib, p1) \
4787                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4788 #define GENATTRIBLERP(attrib, p1, p2) \
4789                 { \
4790                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4791                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4792                 }
4793 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4794                 switch(clipcase) \
4795                 { \
4796                 default: \
4797                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4798                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4799                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4800                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4801                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4802                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4803                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4804                 }
4805
                     // draws flagged as unclipped jump straight into the "all in front" case,
                     // so clipdist[] is never read for them
4806                 if (! clipped)
4807                         goto notclipped;
4808
4809                 // calculate distance from nearplane
4810                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4811                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4812                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
                     // classify by which vertices have a non-negative distance; the clipped
                     // polygon has 3 or 4 points stored in screen[0..numpoints-1]
4813                 if (clipdist[0] >= 0.0f)
4814                 {
4815                         if (clipdist[1] >= 0.0f)
4816                         {
4817                                 if (clipdist[2] >= 0.0f)
4818                                 {
4819                                 notclipped:
4820                                         // triangle is entirely in front of nearplane
4821                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4822                                         SKIPBACKFACE;
4823                                         numpoints = 3;
4824                                         clipcase = 0;
4825                                 }
4826                                 else
4827                                 {
                                             // only vertex 2 is behind the plane: quad
4828                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4829                                         SKIPBACKFACE;
4830                                         numpoints = 4;
4831                                         clipcase = 1;
4832                                 }
4833                         }
4834                         else
4835                         {
4836                                 if (clipdist[2] >= 0.0f)
4837                                 {
                                             // only vertex 1 is behind the plane: quad
4838                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4839                                         SKIPBACKFACE;
4840                                         numpoints = 4;
4841                                         clipcase = 2;
4842                                 }
4843                                 else
4844                                 {
                                             // only vertex 0 is in front: triangle
4845                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4846                                         SKIPBACKFACE;
4847                                         numpoints = 3;
4848                                         clipcase = 3;
4849                                 }
4850                         }
4851                 }
4852                 else if (clipdist[1] >= 0.0f)
4853                 {
4854                         if (clipdist[2] >= 0.0f)
4855                         {
                                     // only vertex 0 is behind the plane: quad
4856                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4857                                 SKIPBACKFACE;
4858                                 numpoints = 4;
4859                                 clipcase = 4;
4860                         }
4861                         else
4862                         {
                                     // only vertex 1 is in front: triangle
4863                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4864                                 SKIPBACKFACE;
4865                                 numpoints = 3;
4866                                 clipcase = 5;
4867                         }
4868                 }
4869                 else if (clipdist[2] >= 0.0f)
4870                 {
                             // only vertex 2 is in front: triangle
4871                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4872                         SKIPBACKFACE;
4873                         numpoints = 3;
4874                         clipcase = 6;
4875                 }
4876                 else continue; // triangle is entirely behind nearplane
4877
4878                 {
4879                         // calculate integer y coords for triangle points
                             // screeni holds the truncated (x, y) of the points as saturated 16-bit pairs
                             // (point 2 is duplicated into the fourth slot for 3-point polygons);
                             // the min/max reductions below yield the bounding box in the low (x, y) pair
4880                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4881                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4882                                         screenmin = _mm_min_epi16(screeni, screenir),
4883                                         screenmax = _mm_max_epi16(screeni, screenir);
4884                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4885                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
                             // clamp the bounding box to the drawable area
4886                         screenmin = _mm_max_epi16(screenmin, fbmin);
4887                         screenmax = _mm_min_epi16(screenmax, fbmax);
4888                         // skip offscreen triangles
4889                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4890                                 continue;
                             // scanline range of the triangle: starty inclusive, endy exclusive
4891                         starty = _mm_extract_epi16(screenmin, 1);
4892                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // triangle lies entirely in the gap between this thread's two bands
4893                         if (starty >= maxy1 && endy <= miny2)
4894                                 continue;
                             // arithmetic shift leaves each point's integer y as a 32-bit lane
4895                         screeny = _mm_srai_epi32(screeni, 16);
4896                 }
4897
                     // fill the next free triangle slot; it is only committed (numtriangles
                     // incremented) after the spans referencing it have been generated
4898                 triangle = &thread->triangles[thread->numtriangles];
4899
4900                 // calculate attribute plans for triangle data...
4901                 // okay, this triangle is going to produce spans, we'd better project
4902                 // the interpolants now (this is what gives perspective texturing),
4903                 // this consists of simply multiplying all arrays by the W coord
4904                 // (which is basically 1/Z), which will be undone per-pixel
4905                 // (multiplying by Z again) to get the perspective-correct array
4906                 // values
4907                 {
4908                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4909                         __m128 mipedgescale, mipdensity;
                             // edge components divided by the SKIPBACKFACE cross product; broadcast
                             // individually they are the coefficients that turn differences along the
                             // two edges into per-pixel x and y gradients
4910                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4911                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4912                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4913                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4914                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
                             // W of the first three polygon points, broadcast to all lanes
4915                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4916                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4917                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                             // plane equation for W: triangle->w = { d/dx, d/dy, value at screen (0, 0) }
4918                         attribedge1 = _mm_sub_ss(w0, w1);
4919                         attribedge2 = _mm_sub_ss(w2, w1);
4920                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4921                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4922                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4923                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4924                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4925                         _mm_store_ss(&triangle->w[0], attribxslope);
4926                         _mm_store_ss(&triangle->w[1], attribyslope);
4927                         _mm_store_ss(&triangle->w[2], attriborigin);
4928                         mipedgescale = _mm_setzero_ps();
                             // same plane setup for each array the shader mode lists (the list ends at the
                             // first index >= DPSOFTRAST_ARRAY_TOTAL); values are premultiplied by W first
4929                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4930                         {
4931                                 __m128 attrib0, attrib1, attrib2;
4932                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4933                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4934                                         break;
                                     // arrays are stored back to back in list order, numvertices*4 floats each
4935                                 arrays += numvertices*4;
4936                                 GENATTRIBS(attrib0, attrib1, attrib2);
4937                                 attriborigin = _mm_mul_ps(attrib1, w1);
4938                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4939                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4940                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4941                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4942                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4943                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4944                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4945                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
4946                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4947                                 {
                                             // for the array that drives mip selection: change of its first two
                                             // components along each edge, scaled by the (approximate) reciprocal
                                             // screen-space length of that edge
4948                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4949                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4950                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4951                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4952                                 }
4953                         }
4954
                             // choose one mip level per texture unit used by the shader mode (default 0);
                             // the unit list ends at the first value >= DPSOFTRAST_MAXTEXTUREUNITS
4955                         memset(triangle->mip, 0, sizeof(triangle->mip));
4956                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4957                         {
4958                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4959                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4960                                         break;
4961                                 texture = thread->texbound[texunit];
                                     // NOTE(review): filter values above DPSOFTRAST_TEXTURE_FILTER_LINEAR are presumably the mipmapped modes - confirm
4962                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4963                                 {
                                             // scale the per-edge deltas by mipmap[0][2] and mipmap[0][3]
                                             // (presumably the base level width and height - confirm), take the
                                             // squared length per edge and keep the smaller of the two edges
4964                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4965                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4966                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4967                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4968                                         // this will be multiplied in the texturing routine by the texture resolution
4969                                         y = _mm_cvtss_si32(mipdensity);
4970                                         if (y > 0)
4971                                         {
                                                     // half the base-2 logarithm of the squared density, clamped to the last mip level
4972                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4973                                                 if (y > texture->mipmaps - 1)
4974                                                         y = texture->mipmaps - 1;
4975                                                 triangle->mip[texunit] = y;
4976                                         }
4977                                 }
4978                         }
4979                 }
4980         
                     // walk the scanlines: the outer loop runs once for band 1 (up to maxy1) and
                     // then for band 2 (from miny2 up to maxy2); the inner loop consumes one band
4981                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4982                 for (; y < bandy;)
4983                 {
4984                         __m128 xcoords, xslope;
                             // per-point mask: all ones where the current scanline is already past the
                             // point's integer y; movemask turns it into four bits per point
4985                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4986                         int yccmask = _mm_movemask_epi8(ycc);
                             // the two polygon edges crossing this scanline, each given as
                             // previous point (p) -> next point (n)
4987                         int edge0p, edge0n, edge1p, edge1n;
4988                         int nexty;
                             // in the /*...*/ annotations below each digit stands for one point, first
                             // digit = point 0, and 1 means the scanline has not passed that point yet
4989                         if (numpoints == 4)
4990                         {
4991                                 switch(yccmask)
4992                                 {
4993                                 default:
4994                                 case 0xFFFF: /*0000*/ y = endy; continue;
4995                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4996                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4997                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4998                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4999                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5000                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5001                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5002                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5003                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5004                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5005                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5006                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5007                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5008                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5009                                 case 0x0000: /*1111*/ y++; continue;
5010                                 }
5011                         }
5012                         else
5013                         {
                                     // 3 points: the fourth slot duplicates point 2, so the two high
                                     // nibbles of the mask are always equal
5014                                 switch(yccmask)
5015                                 {
5016                                 default:
5017                                 case 0xFFFF: /*000*/ y = endy; continue;
5018                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5019                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5020                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5021                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5022                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5023                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5024                                 case 0x0000: /*111*/ y++; continue;
5025                                 }
5026                         }
                             // nexty = smallest integer y among the points not yet passed, i.e. the last
                             // scanline for which the chosen edge pair stays valid (passed points are
                             // replaced by 0x7FFF so they never win the minimum); limited to the band
5027                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5028                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5029                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5030                         nexty = _mm_extract_epi16(ycc, 0);
5031                         if (nexty >= bandy) nexty = bandy-1;
                             // dx/dy of both edges (lanes 0 and 2), then the x of each edge at scanline y
5032                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5033                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5034                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5035                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5036                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
                             // make lane 0 the left edge and lane 2 the right edge
5037                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5038                         {
5039                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5040                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5041                         }
                             // step both edges one scanline at a time up to nexty
5042                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
5043                         {
5044                                 int startx, endx, offset;
5045                                 startx = _mm_cvtss_si32(xcoords);
5046                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5047                                 if (startx < minx) 
5048                                 {
                                             // skip whole span-length chunks that lie left of the scissor;
                                             // the mask assumes DPSOFTRAST_DRAW_MAXSPANLENGTH is a power of two - TODO confirm
5049                                         if (startx < 0) startx = 0;
5050                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5051                                 }
5052                                 if (endx > maxx) endx = maxx;
5053                                 if (startx >= endx) continue;
                                     // cut the scanline segment into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH
                                     // pixels; span->startx/endx are relative to span->x
5054                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5055                                 {
5056                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5057                                         span->triangle = thread->numtriangles;
5058                                         span->x = offset;
5059                                         span->y = y;
5060                                         span->startx = max(minx - offset, 0);
5061                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
                                             // an empty span is simply not committed (its slot gets reused)
5062                                         if (span->startx >= span->endx)
5063                                                 continue; 
                                             // commit the span; flush when the span buffer is full
5064                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5065                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5066                                 }
5067                         }
5068                 }
5069
                     // commit the triangle; flush pending spans before the triangle buffer wraps
5070                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5071                 {
5072                         DPSOFTRAST_Draw_ProcessSpans(thread);
5073                         thread->numtriangles = 0;
5074                 }
5075         }
5076
             // done reading the vertex data: drop our reference, the last thread frees
             // separately allocated arrays
5077         if (!ATOMIC_DECREMENT(command->refcount))
5078         {
5079                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5080                         MM_FREE(command->arrays);
5081         }
5082
             // shade whatever is still buffered for this draw
5083         if (thread->numspans > 0 || thread->numtriangles > 0)
5084         {
5085                 DPSOFTRAST_Draw_ProcessSpans(thread);
5086                 thread->numtriangles = 0;
5087         }
5088 #endif
5089 }
5090
5091 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5092 {
5093         int i;
5094         int j;
5095         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5096         int datasize = 2*numvertices*sizeof(float[4]);
5097         DPSOFTRAST_Command_Draw *command;
5098         unsigned char *data;
5099         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5100         {
5101                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5102                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5103                         break;
5104                 datasize += numvertices*sizeof(float[4]);
5105         }
5106         if (element3s)
5107                 datasize += numtriangles*sizeof(unsigned short[3]);
5108         else if (element3i)
5109                 datasize += numtriangles*sizeof(int[3]);
5110         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5111         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5112         {
5113                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5114                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5115         }
5116         else
5117         {
5118                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5119                 data = (unsigned char *)command + commandsize;
5120         }
5121         command->firstvertex = firstvertex;
5122         command->numvertices = numvertices;
5123         command->numtriangles = numtriangles;
5124         command->arrays = (float *)data;
5125         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5126         dpsoftrast.firstvertex = firstvertex;
5127         dpsoftrast.numvertices = numvertices;
5128         dpsoftrast.screencoord4f = (float *)data;
5129         data += numvertices*sizeof(float[4]);
5130         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5131         data += numvertices*sizeof(float[4]);
5132         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5133         {
5134                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5135                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5136                         break;
5137                 dpsoftrast.post_array4f[j] = (float *)data;
5138                 data += numvertices*sizeof(float[4]);
5139         }
5140         command->element3i = NULL;
5141         command->element3s = NULL;
5142         if (element3s)
5143         {
5144                 command->element3s = (unsigned short *)data;
5145                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5146         }
5147         else if (element3i)
5148         {
5149                 command->element3i = (int *)data;
5150                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5151         }
5152         return command;
5153 }
5154
5155 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5156 {
5157         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5158         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5159         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5160         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5161         if (command->starty >= command->endy)
5162         {
5163                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5164                         MM_FREE(command->arrays);
5165                 DPSOFTRAST_UndoCommand(command->commandsize);
5166                 return;
5167         }
5168         command->clipped = dpsoftrast.drawclipped;
5169         command->refcount = dpsoftrast.numthreads;
5170
5171         if (dpsoftrast.usethreads)
5172         {
5173                 int i;
5174                 DPSOFTRAST_Draw_SyncCommands();
5175                 for (i = 0; i < dpsoftrast.numthreads; i++)
5176                 {
5177                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5178                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5179                                 Thread_CondSignal(thread->drawcond);
5180                 }
5181         }
5182         else
5183         {
5184                 DPSOFTRAST_Draw_FlushThreads();
5185         }
5186 }
5187
5188 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5189 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5190 {
5191         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5192 }
5193 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5194 {
5195         DPSOFTRAST_Command_SetRenderTargets *command;
5196         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5197                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5198                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5199                 DPSOFTRAST_Flush();
5200         dpsoftrast.fb_width = width;
5201         dpsoftrast.fb_height = height;
5202         dpsoftrast.fb_depthpixels = depthpixels;
5203         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5204         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5205         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5206         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5207         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5208         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5209         command->width = width;
5210         command->height = height;
5211 }
5212  
5213 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5214 {
5215         int commandoffset = thread->commandoffset;
5216         while (commandoffset != endoffset)
5217         {
5218                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5219                 switch (command->opcode)
5220                 {
5221 #define INTERPCOMMAND(name) \
5222                 case DPSOFTRAST_OPCODE_##name : \
5223                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5224                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5225                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5226                                 commandoffset = 0; \
5227                         break;
5228                 INTERPCOMMAND(Viewport)
5229                 INTERPCOMMAND(ClearColor)
5230                 INTERPCOMMAND(ClearDepth)
5231                 INTERPCOMMAND(ColorMask)
5232                 INTERPCOMMAND(DepthTest)
5233                 INTERPCOMMAND(ScissorTest)
5234                 INTERPCOMMAND(Scissor)
5235                 INTERPCOMMAND(BlendFunc)
5236                 INTERPCOMMAND(BlendSubtract)
5237                 INTERPCOMMAND(DepthMask)
5238                 INTERPCOMMAND(DepthFunc)
5239                 INTERPCOMMAND(DepthRange)
5240                 INTERPCOMMAND(PolygonOffset)
5241                 INTERPCOMMAND(CullFace)
5242                 INTERPCOMMAND(AlphaTest)
5243                 INTERPCOMMAND(AlphaFunc)
5244                 INTERPCOMMAND(SetTexture)
5245                 INTERPCOMMAND(SetShader)
5246                 INTERPCOMMAND(Uniform4f)
5247                 INTERPCOMMAND(UniformMatrix4f)
5248                 INTERPCOMMAND(Uniform1i)
5249                 INTERPCOMMAND(SetRenderTargets)
5250
5251                 case DPSOFTRAST_OPCODE_Draw:
5252                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5253                         commandoffset += command->commandsize;
5254                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5255                                 commandoffset = 0;
5256                         thread->commandoffset = commandoffset;
5257                         break;
5258
5259                 case DPSOFTRAST_OPCODE_Reset:
5260                         commandoffset = 0;
5261                         break;
5262                 }
5263         }
5264         thread->commandoffset = commandoffset;
5265 }
5266
5267 static int DPSOFTRAST_Draw_Thread(void *data)
5268 {
5269         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5270         while(thread->index >= 0)
5271         {
5272                 if (thread->commandoffset != dpsoftrast.drawcommand)
5273                 {
5274                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5275                 }
5276                 else 
5277                 {
5278                         Thread_LockMutex(thread->drawmutex);
5279                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5280                         {
5281                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5282                                 thread->starving = true;
5283                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5284                                 thread->starving = false;
5285                         }
5286                         Thread_UnlockMutex(thread->drawmutex);
5287                 }
5288         }   
5289         return 0;
5290 }
5291
5292 static void DPSOFTRAST_Draw_FlushThreads(void)
5293 {
5294         DPSOFTRAST_State_Thread *thread;
5295         int i;
5296         DPSOFTRAST_Draw_SyncCommands();
5297         if (dpsoftrast.usethreads) 
5298         {
5299                 for (i = 0; i < dpsoftrast.numthreads; i++)
5300                 {
5301                         thread = &dpsoftrast.threads[i];
5302                         if (thread->commandoffset != dpsoftrast.drawcommand)
5303                         {
5304                                 Thread_LockMutex(thread->drawmutex);
5305                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5306                                         Thread_CondSignal(thread->drawcond);
5307                                 Thread_UnlockMutex(thread->drawmutex);
5308                         }
5309                 }
5310                 for (i = 0; i < dpsoftrast.numthreads; i++)
5311                 {
5312                         thread = &dpsoftrast.threads[i];
5313                         if (thread->commandoffset != dpsoftrast.drawcommand)
5314                         {
5315                                 Thread_LockMutex(thread->drawmutex);
5316                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5317                                 {
5318                                         thread->waiting = true;
5319                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5320                                         thread->waiting = false;
5321                                 }
5322                                 Thread_UnlockMutex(thread->drawmutex);
5323                         }
5324                 }
5325         }
5326         else
5327         {
5328                 for (i = 0; i < dpsoftrast.numthreads; i++)
5329                 {
5330                         thread = &dpsoftrast.threads[i];
5331                         if (thread->commandoffset != dpsoftrast.drawcommand)
5332                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5333                 }
5334         }
5335         dpsoftrast.commandpool.usedcommands = 0;
5336 }
5337
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5342
// Public finish entry point: currently identical to DPSOFTRAST_Flush.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5347
5348 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5349 {
5350         int i;
5351         union
5352         {
5353                 int i;
5354                 unsigned char b[4];
5355         }
5356         u;
5357         u.i = 1;
5358         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5359         dpsoftrast.bigendian = u.b[3];
5360         dpsoftrast.fb_width = width;
5361         dpsoftrast.fb_height = height;
5362         dpsoftrast.fb_depthpixels = depthpixels;
5363         dpsoftrast.fb_colorpixels[0] = colorpixels;
5364         dpsoftrast.fb_colorpixels[1] = NULL;
5365         dpsoftrast.fb_colorpixels[1] = NULL;
5366         dpsoftrast.fb_colorpixels[1] = NULL;
5367         dpsoftrast.viewport[0] = 0;
5368         dpsoftrast.viewport[1] = 0;
5369         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5370         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5371         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5372         dpsoftrast.texture_firstfree = 1;
5373         dpsoftrast.texture_end = 1;
5374         dpsoftrast.texture_max = 0;
5375         dpsoftrast.color[0] = 1;
5376         dpsoftrast.color[1] = 1;
5377         dpsoftrast.color[2] = 1;
5378         dpsoftrast.color[3] = 1;
5379         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5380         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5381         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5382         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5383         for (i = 0; i < dpsoftrast.numthreads; i++)
5384         {
5385                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5386                 thread->index = i;
5387                 thread->cullface = GL_BACK;
5388                 thread->colormask[1] = 1;
5389                 thread->colormask[2] = 1;
5390                 thread->colormask[3] = 1;
5391                 thread->blendfunc[0] = GL_ONE;
5392                 thread->blendfunc[1] = GL_ZERO;
5393                 thread->depthmask = true;
5394                 thread->depthtest = true;
5395                 thread->depthfunc = GL_LEQUAL;
5396                 thread->scissortest = false;
5397                 thread->alphatest = false;
5398                 thread->alphafunc = GL_GREATER;
5399                 thread->alphavalue = 0.5f;
5400                 thread->viewport[0] = 0;
5401                 thread->viewport[1] = 0;
5402                 thread->viewport[2] = dpsoftrast.fb_width;
5403                 thread->viewport[3] = dpsoftrast.fb_height;
5404                 thread->scissor[0] = 0;
5405                 thread->scissor[1] = 0;
5406                 thread->scissor[2] = dpsoftrast.fb_width;
5407                 thread->scissor[3] = dpsoftrast.fb_height;
5408                 thread->depthrange[0] = 0;
5409                 thread->depthrange[1] = 1;
5410                 thread->polygonoffset[0] = 0;
5411                 thread->polygonoffset[1] = 0;
5412         
5413                 DPSOFTRAST_RecalcThread(thread);
5414         
5415                 thread->numspans = 0;
5416                 thread->numtriangles = 0;
5417                 thread->commandoffset = 0;
5418                 thread->waiting = false;
5419                 thread->starving = false;
5420            
5421                 thread->validate = -1;
5422                 DPSOFTRAST_Validate(thread, -1);
5423  
5424                 if (dpsoftrast.usethreads)
5425                 {
5426                         thread->waitcond = Thread_CreateCond();
5427                         thread->drawcond = Thread_CreateCond();
5428                         thread->drawmutex = Thread_CreateMutex();
5429                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5430                 }
5431         }
5432         return 0;
5433 }
5434
5435 void DPSOFTRAST_Shutdown(void)
5436 {
5437         int i;
5438         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5439         {
5440                 DPSOFTRAST_State_Thread *thread;
5441                 for (i = 0; i < dpsoftrast.numthreads; i++)
5442                 {
5443                         thread = &dpsoftrast.threads[i];
5444                         Thread_LockMutex(thread->drawmutex);
5445                         thread->index = -1;
5446                         Thread_CondSignal(thread->drawcond);
5447                         Thread_UnlockMutex(thread->drawmutex);
5448                         Thread_WaitThread(thread->thread, 0);
5449                         Thread_DestroyCond(thread->waitcond);
5450                         Thread_DestroyCond(thread->drawcond);
5451                         Thread_DestroyMutex(thread->drawmutex);
5452                 }
5453         }
5454         for (i = 0;i < dpsoftrast.texture_end;i++)
5455                 if (dpsoftrast.texture[i].bytes)
5456                         MM_FREE(dpsoftrast.texture[i].bytes);
5457         if (dpsoftrast.texture)
5458                 free(dpsoftrast.texture);
5459         if (dpsoftrast.threads)
5460                 MM_FREE(dpsoftrast.threads);
5461         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5462 }
5463