]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
8edf425769a0601d926e1eea1e26ba33ad8cb47b
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
// when compiled as C (not C++), provide bool as an alias of qboolean
#ifndef __cplusplus
typedef qboolean bool;
#endif

// ALIGN_SIZE is reused as COMMAND_SIZE; ATOMIC_SIZE is the alignment passed to _mm_malloc by MM_MALLOC/MM_CALLOC
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32
15
// Compiler-specific alignment and atomic helpers, only selected when SSE2_PRESENT is defined.
// Each branch defines the same set of macros:
//   ALIGN(var)        - declares var with 16 byte alignment
//   ATOMIC(var)       - declares var with 32 byte alignment
//   MEMORY_BARRIER    - expands to _mm_sfence()
//   ATOMIC_COUNTER    - integer type for counters updated through the macros below
//   ATOMIC_INCREMENT / ATOMIC_DECREMENT / ATOMIC_ADD - atomic add/subtract on such a counter
// Anything left undefined here gets a plain fallback further down.
#ifdef SSE2_PRESENT
        #if defined(__APPLE__)
                #include <libkern/OSAtomic.h>
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
                #define MEMORY_BARRIER (_mm_sfence())
                #define ATOMIC_COUNTER volatile int32_t 
                #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
                #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
        #elif defined(__GNUC__)
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile int
                #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
                #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
                #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
        #elif defined(_MSC_VER)
                #define ALIGN(var) __declspec(align(16)) var
                #define ATOMIC(var) __declspec(align(32)) var
                #define MEMORY_BARRIER (_mm_sfence())
                //(MemoryBarrier())
                #define ATOMIC_COUNTER volatile LONG
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
        #endif
#endif
46
// Fallbacks for every helper macro the platform section did not define:
// no alignment attribute, no barrier, and plain (non-atomic) integer arithmetic.
#ifndef ALIGN
#define ALIGN(var) var
#endif
#ifndef ATOMIC
#define ATOMIC(var) var
#endif
#ifndef MEMORY_BARRIER
#define MEMORY_BARRIER ((void)0)
#endif
#ifndef ATOMIC_COUNTER
#define ATOMIC_COUNTER int
#endif
#ifndef ATOMIC_INCREMENT
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#ifndef ATOMIC_DECREMENT
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#ifndef ATOMIC_ADD
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
68
// Allocation helpers: MM_MALLOC / MM_CALLOC / MM_FREE.
// With SSE2 they go through _mm_malloc/_mm_free using ATOMIC_SIZE alignment,
// otherwise they map directly onto malloc/calloc/free.
// Memory obtained from MM_MALLOC or MM_CALLOC must be released with MM_FREE.
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc((size), ATOMIC_SIZE)

// calloc() counterpart of MM_MALLOC: returns nmemb*size zero-filled bytes,
// or NULL if the allocation fails or the byte count does not fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
        size_t total;
        void *ptr;
        // refuse requests whose total size would wrap around, like calloc() does
        if (size != 0 && nmemb > (size_t)-1 / size)
                return NULL;
        total = nmemb * size;
        ptr = _mm_malloc(total, ATOMIC_SIZE);
        if (ptr != NULL)
                memset(ptr, 0, total);
        return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
87
// Indices of the per-vertex attribute arrays (position, color, eight texcoord sets).
// DPSOFTRAST_ARRAY_TOTAL is the number of arrays and sizes the attribs[] /
// post_array4f[] members declared further down.
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION,
        DPSOFTRAST_ARRAY_COLOR,
        DPSOFTRAST_ARRAY_TEXCOORD0,
        DPSOFTRAST_ARRAY_TEXCOORD1,
        DPSOFTRAST_ARRAY_TEXCOORD2,
        DPSOFTRAST_ARRAY_TEXCOORD3,
        DPSOFTRAST_ARRAY_TEXCOORD4,
        DPSOFTRAST_ARRAY_TEXCOORD5,
        DPSOFTRAST_ARRAY_TEXCOORD6,
        DPSOFTRAST_ARRAY_TEXCOORD7,
        DPSOFTRAST_ARRAY_TOTAL
}
DPSOFTRAST_ARRAY;
103
// One slot of the dpsoftrast.texture array; a slot whose bytes pointer is NULL is free.
typedef struct DPSOFTRAST_Texture_s
{
        int flags; // DPSOFTRAST_TEXTURE_FLAG_* / format bits given to DPSOFTRAST_Texture_New
        int width; // dimensions of mip level 0
        int height;
        int depth;
        int sides; // 6 for cubemaps, 1 otherwise
        DPSOFTRAST_TEXTURE_FILTER filter;
        int mipmaps; // number of mip levels described in mipmap[]
        int size; // total byte size of all mip levels (size of the bytes allocation)
        ATOMIC_COUNTER binds; // when nonzero, the texture functions call DPSOFTRAST_Flush before touching the texture
        unsigned char *bytes; // pixel storage for all mip levels, allocated with MM_CALLOC
        int mipmap[DPSOFTRAST_MAXMIPMAPS][5]; // per level: byte offset into bytes, byte size, width, height, depth
}
DPSOFTRAST_Texture;
119
// commands use the same alignment as ALIGN()
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Common header shared by all command structs: every struct generated by
// DEFCOMMAND starts with these same two members.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode;
        unsigned short commandsize;
}
DPSOFTRAST_Command);
129
enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares a command type:
//   - the constant DPSOFTRAST_OPCODE_<name> with value opcodeval
//   - the aligned struct type DPSOFTRAST_Command_<name>, consisting of the
//     opcode/commandsize header followed by the given fields
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );
140
// size in bytes of the command pool buffer
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the largest size a single command may have - confirm against the allocator
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// Byte buffer that holds queued commands, plus its bookkeeping counters.
typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
{
        int freecommand;
        int usedcommands;
        ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
151
// Per-triangle interpolation data.
// For each attribute array, attribs[array][0] is the per-x slope, attribs[array][1]
// the per-y slope and attribs[array][2] the base value, as consumed by
// DPSOFTRAST_CALCATTRIB / DPSOFTRAST_CALCATTRIB4F.
typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
{
        unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        float w[3];
        ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
159
// SSE form: loads the per-x slope of attribute arrayindex into slope and evaluates
// the attribute at the span start into data:
//   data = attribs[arrayindex][2] + span->x * attribs[arrayindex][0] + span->y * attribs[arrayindex][1]
// slope and data must be __m128 lvalues; the attribs rows are read with aligned loads.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar form of DPSOFTRAST_CALCATTRIB for float[4] data/slope arrays:
//   slope[c] = attribs[arrayindex][0][c]
//   data[c]  = attribs[arrayindex][2][c] + span->x * slope[c] + span->y * attribs[arrayindex][1][c]
// span, data and slope are parenthesized so that arbitrary pointer expressions
// (for example "spans + i") can be passed safely.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        (slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
        (slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
        (slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
        (slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
        (data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
        (data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
        (data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
        (data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
176                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels produced from a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle; // triangle this span was generated by
        int x; // framebuffer x coord
        int y; // framebuffer y coord
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
}
DPSOFTRAST_State_Span);
189
// capacities of the spans[] and triangles[] arrays in DPSOFTRAST_State_Thread
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// bit flags stored in DPSOFTRAST_State_Thread::validate; each names a group of
// derived values that DPSOFTRAST_Validate recalculates when the bit is set
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
197
// Blend modes stored in fb_blendmode; DPSOFTRAST_RecalcBlendFunc selects one from
// the source/destination blend factors (and the blendsubtract flag), falling
// back to DPSOFTRAST_BLENDMODE_OPAQUE for combinations it does not list.
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE,
        DPSOFTRAST_BLENDMODE_ALPHA,
        DPSOFTRAST_BLENDMODE_ADDALPHA,
        DPSOFTRAST_BLENDMODE_ADD,
        DPSOFTRAST_BLENDMODE_INVMOD,
        DPSOFTRAST_BLENDMODE_MUL,
        DPSOFTRAST_BLENDMODE_MUL2,
        DPSOFTRAST_BLENDMODE_SUBALPHA,
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
        DPSOFTRAST_BLENDMODE_INVADD,
        DPSOFTRAST_BLENDMODE_TOTAL
}
DPSOFTRAST_BLENDMODE;
213
// State record kept per thread (dpsoftrast.threads points to an array of
// dpsoftrast.numthreads of these): render state, values derived from it,
// and the span/triangle work arrays.
typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
{
        void *thread;
        int index;
        
        int cullface;
        int colormask[4];
        int blendfunc[2]; // [0] = source factor, [1] = destination factor (see DPSOFTRAST_RecalcBlendFunc)
        int blendsubtract;
        int depthmask;
        int depthtest;
        int depthfunc;
        int scissortest;
        int alphatest;
        int alphafunc;
        float alphavalue;
        int viewport[4];
        int scissor[4]; // x, y, width, height as read by DPSOFTRAST_RecalcFB
        float depthrange[2];
        float polygonoffset[2];

        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // pointers into dpsoftrast.texture, rebased by DPSOFTRAST_Texture_Grow
        
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // DPSOFTRAST_VALIDATE_ flags
        int validate;

        // derived values (DPSOFTRAST_VALIDATE_FB)
        int fb_colormask;
        int fb_scissor[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
        int fb_depthfunc;

        // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
        int fb_blendmode;

        // band boundaries
        int miny1;
        int maxy1;
        int miny2;
        int maxy2;

        ATOMIC(volatile int commandoffset);

        volatile bool waiting;
        volatile bool starving;
        void *waitcond;
        void *drawcond;
        void *drawmutex;

        int numspans;
        int numtriangles;
        DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
        DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
}
DPSOFTRAST_State_Thread);
279
// Global rasterizer state: render targets, vertex array pointers, the texture
// table, the per-thread state array and the command pool.
typedef ATOMIC(struct DPSOFTRAST_State_s
{
        int fb_width;
        int fb_height;
        unsigned int *fb_depthpixels;
        unsigned int *fb_colorpixels[4];

        int viewport[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        float color[4];
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        const float *pointer_vertex3f;
        const float *pointer_color4f;
        const unsigned char *pointer_color4ub;
        const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int stride_vertex;
        int stride_color;
        int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // pointers into texture[], rebased by DPSOFTRAST_Texture_Grow

        int firstvertex;
        int numvertices;
        float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
        float *screencoord4f;
        int drawstarty;
        int drawendy;
        int drawclipped;
        
        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        int texture_max; // number of slots allocated in texture[]
        int texture_end; // one past the highest slot considered in use
        int texture_firstfree; // slot index where the search for a free slot starts
        DPSOFTRAST_Texture *texture; // texture table, grown by DPSOFTRAST_Texture_Grow

        int bigendian;

        // error reporting
        const char *errorstring;

        bool usethreads;
        int interlace;
        int numthreads;
        DPSOFTRAST_State_Thread *threads;

        ATOMIC(volatile int drawcommand);

        DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

// the single global rasterizer state instance
DPSOFTRAST_State dpsoftrast;
339
// scale applied to (1 - depth) when converting a float depth to the integer depth buffer format (2^30)
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float color components (nominally 0..1) into one 32bit pixel laid
// out as alpha<<24 | red<<16 | green<<8 | blue, rounding each component to nearest.
// Every argument is parenthesized so compound expressions scale as a whole, and
// the shifts are done on unsigned values so a full alpha does not overflow a
// signed int; the result is converted back to int as before.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)(((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | ((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | (unsigned int)(int)((b) * 255.0f + 0.5f) | ((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d to the integer depth format: DPSOFTRAST_DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
345
346 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
347 {
348         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
349         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
350         fb_viewportcenter[3] = 0.5f;
351         fb_viewportcenter[0] = 0.0f;
352         fb_viewportscale[1] = 0.5f * viewport[2];
353         fb_viewportscale[2] = -0.5f * viewport[3];
354         fb_viewportscale[3] = 0.5f;
355         fb_viewportscale[0] = 1.0f;
356 }
357
358 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
359 {
360         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
361         // and viewport projection values
362         int x1, x2;
363         int y1, y2;
364         x1 = thread->scissor[0];
365         x2 = thread->scissor[0] + thread->scissor[2];
366         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
367         y2 = dpsoftrast.fb_height - thread->scissor[1];
368         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
369         if (x1 < 0) x1 = 0;
370         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
371         if (y1 < 0) y1 = 0;
372         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
373         thread->fb_scissor[0] = x1;
374         thread->fb_scissor[1] = y1;
375         thread->fb_scissor[2] = x2 - x1;
376         thread->fb_scissor[3] = y2 - y1;
377
378         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
379 }
380
381 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
382 {
383         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
384 }
385
386 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
387 {
388         if (thread->blendsubtract)
389         {
390                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
391                 {
392                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
393                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
394                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
395                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
396                 }
397         }
398         else
399         {       
400                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
401                 {
402                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
403                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
404                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
405                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
406                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
407                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
408                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
409                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
410                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
411                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
412                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
413                 }
414         }
415 }
416
// Calls DPSOFTRAST_Validate(thread, f) only when one of the flags in f is
// still pending in thread->validate; always evaluates to 0.
// Both arguments are parenthesized so pointer expressions can be passed as thread.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
418
419 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
420 {
421         mask &= thread->validate;
422         if (!mask)
423                 return;
424         if (mask & DPSOFTRAST_VALIDATE_FB)
425         {
426                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
427                 DPSOFTRAST_RecalcFB(thread);
428         }
429         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
430         {
431                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
432                 DPSOFTRAST_RecalcDepthFunc(thread);
433         }
434         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
435         {
436                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
437                 DPSOFTRAST_RecalcBlendFunc(thread);
438         }
439 }
440
441 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
442 {
443         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
444                 return &dpsoftrast.texture[index];
445         return NULL;
446 }
447
448 static void DPSOFTRAST_Texture_Grow(void)
449 {
450         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
451         DPSOFTRAST_State_Thread *thread;
452         int i;
453         int j;
454         DPSOFTRAST_Flush();
455         // expand texture array as needed
456         if (dpsoftrast.texture_max < 1024)
457                 dpsoftrast.texture_max = 1024;
458         else
459                 dpsoftrast.texture_max *= 2;
460         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
461         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
462                 if (dpsoftrast.texbound[i])
463                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
464         for (j = 0; j < dpsoftrast.numthreads; j++)
465         {
466                 thread = &dpsoftrast.threads[j];
467                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
468                         if (thread->texbound[i])
469                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
470         }
471 }
472
473 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
474 {
475         int w;
476         int h;
477         int d;
478         int size;
479         int s;
480         int texnum;
481         int mipmaps;
482         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
483         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
484         DPSOFTRAST_Texture *texture;
485         if (width*height*depth < 1)
486         {
487                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
488                 return 0;
489         }
490         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
491         {
492                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
493                 return 0;
494         }
495         switch(texformat)
496         {
497         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
498         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
499         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
500                 break;
501         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
502                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
503                 {
504                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
505                         return 0;
506                 }
507                 if (depth != 1)
508                 {
509                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
510                         return 0;
511                 }
512                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
513                 {
514                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
515                         return 0;
516                 }
517                 break;
518         }
519         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
520         {
521                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
522                 return 0;
523         }
524         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
525         {
526                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
527                 return 0;
528         }
529         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
530         {
531                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
532                 return 0;
533         }
534         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
535         {
536                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
537                 return 0;
538         }
539         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
540         {
541                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
542                 return 0;
543         }
544         // find first empty slot in texture array
545         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
546                 if (!dpsoftrast.texture[texnum].bytes)
547                         break;
548         dpsoftrast.texture_firstfree = texnum + 1;
549         if (dpsoftrast.texture_max <= texnum)
550                 DPSOFTRAST_Texture_Grow();
551         if (dpsoftrast.texture_end <= texnum)
552                 dpsoftrast.texture_end = texnum + 1;
553         texture = &dpsoftrast.texture[texnum];
554         memset(texture, 0, sizeof(*texture));
555         texture->flags = flags;
556         texture->width = width;
557         texture->height = height;
558         texture->depth = depth;
559         texture->sides = sides;
560         texture->binds = 0;
561         w = width;
562         h = height;
563         d = depth;
564         size = 0;
565         mipmaps = 0;
566         w = width;
567         h = height;
568         d = depth;
569         for (;;)
570         {
571                 s = w * h * d * sides * 4;
572                 texture->mipmap[mipmaps][0] = size;
573                 texture->mipmap[mipmaps][1] = s;
574                 texture->mipmap[mipmaps][2] = w;
575                 texture->mipmap[mipmaps][3] = h;
576                 texture->mipmap[mipmaps][4] = d;
577                 size += s;
578                 mipmaps++;
579                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
580                         break;
581                 if (w > 1) w >>= 1;
582                 if (h > 1) h >>= 1;
583                 if (d > 1) d >>= 1;
584         }
585         texture->mipmaps = mipmaps;
586         texture->size = size;
587
588         // allocate the pixels now
589         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
590
591         return texnum;
592 }
593 void DPSOFTRAST_Texture_Free(int index)
594 {
595         DPSOFTRAST_Texture *texture;
596         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
597         if (texture->binds)
598                 DPSOFTRAST_Flush();
599         if (texture->bytes)
600                 MM_FREE(texture->bytes);
601         texture->bytes = NULL;
602         memset(texture, 0, sizeof(*texture));
603         // adjust the free range and used range
604         if (dpsoftrast.texture_firstfree > index)
605                 dpsoftrast.texture_firstfree = index;
606         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
607                 dpsoftrast.texture_end--;
608 }
// Regenerates mip levels 1..mipmaps-1 of the texture at index, each level
// being averaged from the level directly above it (4 bytes per pixel, every
// byte averaged independently with rounding).
// Does nothing for an invalid index or a texture with a single level.
void DPSOFTRAST_Texture_CalculateMipmaps(int index)
{
        int i, x, y, z, w, layer0, layer1, row0, row1;
        unsigned char *o, *i0, *i1, *i2, *i3;
        DPSOFTRAST_Texture *texture;
        texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
        if (texture->mipmaps <= 1)
                return;
        for (i = 1;i < texture->mipmaps;i++)
        {
                for (z = 0;z < texture->mipmap[i][4];z++)
                {
                        // the two parent layers feeding this layer (clamped when the parent has only one)
                        layer0 = z*2;
                        layer1 = z*2+1;
                        if (layer1 >= texture->mipmap[i-1][4])
                                layer1 = texture->mipmap[i-1][4]-1;
                        for (y = 0;y < texture->mipmap[i][3];y++)
                        {
                                // the two parent rows feeding this row (clamped when the parent has only one)
                                row0 = y*2;
                                row1 = y*2+1;
                                if (row1 >= texture->mipmap[i-1][3])
                                        row1 = texture->mipmap[i-1][3]-1;
                                // o: output row in level i; i0..i3: the parent rows at (layer0,row0), (layer0,row1), (layer1,row0), (layer1,row1)
                                o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
                                i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
                                i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
                                i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
                                i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
                                w = texture->mipmap[i][2];
                                if (layer1 > layer0)
                                {
                                        if (texture->mipmap[i-1][2] > 1)
                                        {
                                                // average 3D texture
                                                for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
                                                {
                                                        o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
                                                        o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
                                                        o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
                                                        o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
                                                }
                                        }
                                        else
                                        {
                                                // average 3D mipmap with parent width == 1
                                                // NOTE(review): i2/i3 are not advanced here; with a parent width of 1 this level's width
                                                // is also 1 (see DPSOFTRAST_Texture_New), so the loop presumably runs once - confirm
                                                for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
                                                {
                                                        o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
                                                        o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
                                                        o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
                                                        o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
                                                }
                                        }
                                }
                                else
                                {
                                        if (texture->mipmap[i-1][2] > 1)
                                        {
                                                // average 2D texture (common case)
                                                for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
                                                {
                                                        o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
                                                        o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
                                                        o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
                                                        o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
                                                }
                                        }
                                        else
                                        {
                                                // 2D texture with parent width == 1
                                                o[0] = (i0[0] + i1[0] + 1) >> 1;
                                                o[1] = (i0[1] + i1[1] + 1) >> 1;
                                                o[2] = (i0[2] + i1[2] + 1) >> 1;
                                                o[3] = (i0[3] + i1[3] + 1) >> 1;
                                        }
                                }
                        }
                }
        }
}
688 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
689 {
690         DPSOFTRAST_Texture *texture;
691         unsigned char *dst;
692         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
693         if (texture->binds)
694                 DPSOFTRAST_Flush();
695         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
696         while (blockheight > 0)
697         {
698                 memcpy(dst, pixels, blockwidth * 4);
699                 pixels += blockwidth * 4;
700                 dst += texture->mipmap[0][2] * 4;
701                 blockheight--;
702         }
703         DPSOFTRAST_Texture_CalculateMipmaps(index);
704 }
705 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
706 {
707         DPSOFTRAST_Texture *texture;
708         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
709         if (texture->binds)
710                 DPSOFTRAST_Flush();
711         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
712         DPSOFTRAST_Texture_CalculateMipmaps(index);
713 }
714 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
715 {
716         DPSOFTRAST_Texture *texture;
717         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
718         return texture->mipmap[mip][2];
719 }
720 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
721 {
722         DPSOFTRAST_Texture *texture;
723         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
724         return texture->mipmap[mip][3];
725 }
726 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
727 {
728         DPSOFTRAST_Texture *texture;
729         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
730         return texture->mipmap[mip][4];
731 }
732 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
733 {
734         DPSOFTRAST_Texture *texture;
735         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
736         if (texture->binds)
737                 DPSOFTRAST_Flush();
738         return texture->bytes + texture->mipmap[mip][0];
739 }
740 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
741 {
742         DPSOFTRAST_Texture *texture;
743         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
744         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
745         {
746                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
747                 return;
748         }
749         if (texture->binds)
750                 DPSOFTRAST_Flush();
751         texture->filter = filter;
752 }
753
754 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
755 {
756         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
757                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
758                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
759                 DPSOFTRAST_Flush();
760         dpsoftrast.fb_width = width;
761         dpsoftrast.fb_height = height;
762         dpsoftrast.fb_depthpixels = depthpixels;
763         dpsoftrast.fb_colorpixels[0] = colorpixels0;
764         dpsoftrast.fb_colorpixels[1] = colorpixels1;
765         dpsoftrast.fb_colorpixels[2] = colorpixels2;
766         dpsoftrast.fb_colorpixels[3] = colorpixels3;
767 }
768
769 static void DPSOFTRAST_Draw_FlushThreads(void);
770
771 static void DPSOFTRAST_Draw_SyncCommands(void)
772 {
773         if(dpsoftrast.usethreads) MEMORY_BARRIER;
774         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
775 }
776
777 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
778 {
779         DPSOFTRAST_State_Thread *thread;
780         int i;
781         int freecommand = dpsoftrast.commandpool.freecommand;
782         int usedcommands = dpsoftrast.commandpool.usedcommands;
783         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
784                 return;
785         DPSOFTRAST_Draw_SyncCommands();
786         for(;;)
787         {
788                 int waitindex = -1;
789                 int commandoffset;
790                 usedcommands = 0;
791                 for (i = 0; i < dpsoftrast.numthreads; i++)
792                 {
793                         thread = &dpsoftrast.threads[i]; 
794                         commandoffset = freecommand - thread->commandoffset;
795                         if (commandoffset < 0)
796                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
797                         if (commandoffset > usedcommands)
798                         {
799                                 waitindex = i;
800                                 usedcommands = commandoffset;
801                         }
802                 }
803                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
804                         break;
805                 thread = &dpsoftrast.threads[waitindex];
806                 Thread_LockMutex(thread->drawmutex);
807                 if (thread->commandoffset != dpsoftrast.drawcommand)
808                 {
809                         thread->waiting = true;
810                         if (thread->starving) Thread_CondSignal(thread->drawcond);
811                         Thread_CondWait(thread->waitcond, thread->drawmutex);
812                         thread->waiting = false;
813                 }
814                 Thread_UnlockMutex(thread->drawmutex);
815         }
816         dpsoftrast.commandpool.usedcommands = usedcommands;
817 }
818
// rounds size up to the next multiple of COMMAND_SIZE
// (the masking with COMMAND_SIZE-1 only works when COMMAND_SIZE is a power of two)
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))

// reserves an aligned DPSOFTRAST_Command_<name> in the command pool, tagged with
// DPSOFTRAST_OPCODE_<name>, and yields a pointer to it
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
823
824 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
825 {
826         DPSOFTRAST_Command *command;
827         int freecommand = dpsoftrast.commandpool.freecommand;
828         int usedcommands = dpsoftrast.commandpool.usedcommands;
829         int extra = sizeof(DPSOFTRAST_Command);
830         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
831                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
832         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
833         {
834                 if (dpsoftrast.usethreads)
835                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
836                 else
837                         DPSOFTRAST_Draw_FlushThreads();
838                 freecommand = dpsoftrast.commandpool.freecommand;
839                 usedcommands = dpsoftrast.commandpool.usedcommands;
840         }
841         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
842         {
843                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
844                 command->opcode = DPSOFTRAST_OPCODE_Reset;
845                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
846                 freecommand = 0;
847         }
848         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
849         command->opcode = opcode;
850         command->commandsize = size;
851         freecommand += size;
852         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
853                 freecommand = 0;
854         dpsoftrast.commandpool.freecommand = freecommand;
855         dpsoftrast.commandpool.usedcommands = usedcommands + size;
856         return command;
857 }
858
859 static void DPSOFTRAST_UndoCommand(int size)
860 {
861         int freecommand = dpsoftrast.commandpool.freecommand;
862         int usedcommands = dpsoftrast.commandpool.usedcommands;
863         freecommand -= size;
864         if (freecommand < 0)
865                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
866         usedcommands -= size;
867         dpsoftrast.commandpool.freecommand = freecommand;
868         dpsoftrast.commandpool.usedcommands = usedcommands;
869 }
870                 
871 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
872 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
873 {
874         thread->viewport[0] = command->x;
875         thread->viewport[1] = command->y;
876         thread->viewport[2] = command->width;
877         thread->viewport[3] = command->height;
878         thread->validate |= DPSOFTRAST_VALIDATE_FB;
879 }
880 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
881 {
882         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
883         command->x = x;
884         command->y = y;
885         command->width = width;
886         command->height = height;
887
888         dpsoftrast.viewport[0] = x;
889         dpsoftrast.viewport[1] = y;
890         dpsoftrast.viewport[2] = width;
891         dpsoftrast.viewport[3] = height;
892         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
893 }
894
895 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
896 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
897 {
898         int i, x1, y1, x2, y2, w, h, x, y;
899         int miny1 = thread->miny1;
900         int maxy1 = thread->maxy1;
901         int miny2 = thread->miny2;
902         int maxy2 = thread->maxy2;
903         int bandy;
904         unsigned int *p;
905         unsigned int c;
906         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
907         x1 = thread->fb_scissor[0];
908         y1 = thread->fb_scissor[1];
909         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
910         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
911         if (y1 < miny1) y1 = miny1;
912         if (y2 > maxy2) y2 = maxy2;
913         w = x2 - x1;
914         h = y2 - y1;
915         if (w < 1 || h < 1)
916                 return;
917         // FIXME: honor fb_colormask?
918         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
919         for (i = 0;i < 4;i++)
920         {
921                 if (!dpsoftrast.fb_colorpixels[i])
922                         continue;
923                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
924                 for (;y < bandy;y++)
925                 {
926                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
927                         for (x = x1;x < x2;x++)
928                                 p[x] = c;
929                 }
930         }
931 }
932 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
933 {
934         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
935         command->r = r;
936         command->g = g;
937         command->b = b;
938         command->a = a;
939 }
940
941 DEFCOMMAND(3, ClearDepth, float depth;)
942 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
943 {
944         int x1, y1, x2, y2, w, h, x, y;
945         int miny1 = thread->miny1;
946         int maxy1 = thread->maxy1;
947         int miny2 = thread->miny2;
948         int maxy2 = thread->maxy2;
949         int bandy;
950         unsigned int *p;
951         unsigned int c;
952         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
953         x1 = thread->fb_scissor[0];
954         y1 = thread->fb_scissor[1];
955         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
956         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
957         if (y1 < miny1) y1 = miny1;
958         if (y2 > maxy2) y2 = maxy2;
959         w = x2 - x1;
960         h = y2 - y1;
961         if (w < 1 || h < 1)
962                 return;
963         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
964         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
965         for (;y < bandy;y++)
966         {
967                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
968                 for (x = x1;x < x2;x++)
969                         p[x] = c;
970         }
971 }
972 void DPSOFTRAST_ClearDepth(float d)
973 {
974         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
975         command->depth = d;
976 }
977
978 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
979 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
980 {
981         thread->colormask[0] = command->r != 0;
982         thread->colormask[1] = command->g != 0;
983         thread->colormask[2] = command->b != 0;
984         thread->colormask[3] = command->a != 0;
985         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
986 }
987 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
988 {
989         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
990         command->r = r;
991         command->g = g;
992         command->b = b;
993         command->a = a;
994 }
995
996 DEFCOMMAND(5, DepthTest, int enable;)
997 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
998 {
999         thread->depthtest = command->enable;
1000         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1001 }
1002 void DPSOFTRAST_DepthTest(int enable)
1003 {
1004         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1005         command->enable = enable;
1006 }
1007
1008 DEFCOMMAND(6, ScissorTest, int enable;)
1009 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1010 {
1011         thread->scissortest = command->enable;
1012         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1013 }
1014 void DPSOFTRAST_ScissorTest(int enable)
1015 {
1016         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1017         command->enable = enable;
1018 }
1019
1020 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1021 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1022 {
1023         thread->scissor[0] = command->x;
1024         thread->scissor[1] = command->y;
1025         thread->scissor[2] = command->width;
1026         thread->scissor[3] = command->height;
1027         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1028 }
1029 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1030 {
1031         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1032         command->x = x;
1033         command->y = y;
1034         command->width = width;
1035         command->height = height;
1036 }
1037
1038 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1039 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1040 {
1041         thread->blendfunc[0] = command->sfactor;
1042         thread->blendfunc[1] = command->dfactor;
1043         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1044 }
1045 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1046 {
1047         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1048         command->sfactor = sfactor;
1049         command->dfactor = dfactor;
1050 }
1051
1052 DEFCOMMAND(9, BlendSubtract, int enable;)
1053 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1054 {
1055         thread->blendsubtract = command->enable;
1056         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1057 }
1058 void DPSOFTRAST_BlendSubtract(int enable)
1059 {
1060         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1061         command->enable = enable;
1062 }
1063
1064 DEFCOMMAND(10, DepthMask, int enable;)
1065 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1066 {
1067         thread->depthmask = command->enable;
1068 }
1069 void DPSOFTRAST_DepthMask(int enable)
1070 {
1071         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1072         command->enable = enable;
1073 }
1074
1075 DEFCOMMAND(11, DepthFunc, int func;)
1076 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1077 {
1078         thread->depthfunc = command->func;
1079 }
1080 void DPSOFTRAST_DepthFunc(int func)
1081 {
1082         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1083         command->func = func;
1084 }
1085
1086 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1087 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1088 {
1089         thread->depthrange[0] = command->nearval;
1090         thread->depthrange[1] = command->farval;
1091 }
1092 void DPSOFTRAST_DepthRange(float nearval, float farval)
1093 {
1094         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1095         command->nearval = nearval;
1096         command->farval = farval;
1097 }
1098
1099 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1100 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1101 {
1102         thread->polygonoffset[0] = command->alongnormal;
1103         thread->polygonoffset[1] = command->intoview;
1104 }
1105 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1106 {
1107         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1108         command->alongnormal = alongnormal;
1109         command->intoview = intoview;
1110 }
1111
1112 DEFCOMMAND(14, CullFace, int mode;)
1113 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1114 {
1115         thread->cullface = command->mode;
1116 }
1117 void DPSOFTRAST_CullFace(int mode)
1118 {
1119         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1120         command->mode = mode;
1121 }
1122
1123 DEFCOMMAND(15, AlphaTest, int enable;)
1124 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1125 {
1126         thread->alphatest = command->enable;
1127 }
1128 void DPSOFTRAST_AlphaTest(int enable)
1129 {
1130         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1131         command->enable = enable;
1132 }
1133
1134 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1135 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1136 {
1137         thread->alphafunc = command->func;
1138         thread->alphavalue = command->ref;
1139 }
1140 void DPSOFTRAST_AlphaFunc(int func, float ref)
1141 {
1142         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1143         command->func = func;
1144         command->ref = ref;
1145 }
1146
1147 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1148 {
1149         dpsoftrast.color[0] = r;
1150         dpsoftrast.color[1] = g;
1151         dpsoftrast.color[2] = b;
1152         dpsoftrast.color[3] = a;
1153 }
1154
1155 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1156 {
1157         int outstride = blockwidth * 4;
1158         int instride = dpsoftrast.fb_width * 4;
1159         int bx1 = blockx;
1160         int by1 = blocky;
1161         int bx2 = blockx + blockwidth;
1162         int by2 = blocky + blockheight;
1163         int bw;
1164         int x;
1165         int y;
1166         unsigned char *inpixels;
1167         unsigned char *b;
1168         unsigned char *o;
1169         DPSOFTRAST_Flush();
1170         if (bx1 < 0) bx1 = 0;
1171         if (by1 < 0) by1 = 0;
1172         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1173         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1174         bw = bx2 - bx1;
1175         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1176         if (dpsoftrast.bigendian)
1177         {
1178                 for (y = by1;y < by2;y++)
1179                 {
1180                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1181                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1182                         for (x = bx1;x < bx2;x++)
1183                         {
1184                                 o[0] = b[3];
1185                                 o[1] = b[2];
1186                                 o[2] = b[1];
1187                                 o[3] = b[0];
1188                                 o += 4;
1189                                 b += 4;
1190                         }
1191                 }
1192         }
1193         else
1194         {
1195                 for (y = by1;y < by2;y++)
1196                 {
1197                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1198                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1199                         memcpy(o, b, bw*4);
1200                 }
1201         }
1202
1203 }
1204 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1205 {
1206         int tx1 = tx;
1207         int ty1 = ty;
1208         int tx2 = tx + width;
1209         int ty2 = ty + height;
1210         int sx1 = sx;
1211         int sy1 = sy;
1212         int sx2 = sx + width;
1213         int sy2 = sy + height;
1214         int swidth;
1215         int sheight;
1216         int twidth;
1217         int theight;
1218         int sw;
1219         int sh;
1220         int tw;
1221         int th;
1222         int y;
1223         unsigned int *spixels;
1224         unsigned int *tpixels;
1225         DPSOFTRAST_Texture *texture;
1226         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1227         if (mip < 0 || mip >= texture->mipmaps) return;
1228         DPSOFTRAST_Flush();
1229         spixels = dpsoftrast.fb_colorpixels[0];
1230         swidth = dpsoftrast.fb_width;
1231         sheight = dpsoftrast.fb_height;
1232         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1233         twidth = texture->mipmap[mip][2];
1234         theight = texture->mipmap[mip][3];
1235         if (tx1 < 0) tx1 = 0;
1236         if (ty1 < 0) ty1 = 0;
1237         if (tx2 > twidth) tx2 = twidth;
1238         if (ty2 > theight) ty2 = theight;
1239         if (sx1 < 0) sx1 = 0;
1240         if (sy1 < 0) sy1 = 0;
1241         if (sx2 > swidth) sx2 = swidth;
1242         if (sy2 > sheight) sy2 = sheight;
1243         tw = tx2 - tx1;
1244         th = ty2 - ty1;
1245         sw = sx2 - sx1;
1246         sh = sy2 - sy1;
1247         if (tw > sw) tw = sw;
1248         if (th > sh) th = sh;
1249         if (tw < 1 || th < 1)
1250                 return;
1251         sy1 = sheight - 1 - sy1;
1252         for (y = 0;y < th;y++)
1253                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1254         if (texture->mipmaps > 1)
1255                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1256 }
1257
1258 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1259 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1260 {
1261         if (thread->texbound[command->unitnum])
1262                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1263         thread->texbound[command->unitnum] = command->texture;
1264 }
1265 void DPSOFTRAST_SetTexture(int unitnum, int index)
1266 {
1267         DPSOFTRAST_Command_SetTexture *command;
1268         DPSOFTRAST_Texture *texture;
1269         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1270         {
1271                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1272                 return;
1273         }
1274         texture = DPSOFTRAST_Texture_GetByIndex(index);
1275         if (index && !texture)
1276         {
1277                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1278                 return;
1279         }
1280
1281         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1282         command->unitnum = unitnum;
1283         command->texture = texture;
1284
1285         dpsoftrast.texbound[unitnum] = texture;
1286         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1287 }
1288
1289 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1290 {
1291         dpsoftrast.pointer_vertex3f = vertex3f;
1292         dpsoftrast.stride_vertex = stride;
1293 }
1294 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1295 {
1296         dpsoftrast.pointer_color4f = color4f;
1297         dpsoftrast.pointer_color4ub = NULL;
1298         dpsoftrast.stride_color = stride;
1299 }
1300 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1301 {
1302         dpsoftrast.pointer_color4f = NULL;
1303         dpsoftrast.pointer_color4ub = color4ub;
1304         dpsoftrast.stride_color = stride;
1305 }
1306 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1307 {
1308         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1309         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1310         dpsoftrast.stride_texcoord[unitnum] = stride;
1311 }
1312
1313 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1314 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1315 {
1316         thread->shader_mode = command->mode;
1317         thread->shader_permutation = command->permutation;
1318         thread->shader_exactspecularmath = command->exactspecularmath;
1319 }
1320 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1321 {
1322         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1323         command->mode = mode;
1324         command->permutation = permutation;
1325         command->exactspecularmath = exactspecularmath;
1326
1327         dpsoftrast.shader_mode = mode;
1328         dpsoftrast.shader_permutation = permutation;
1329         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1330 }
1331
1332 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1333 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1334 {
1335         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1336 }
1337 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1338 {
1339         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1340         command->index = index;
1341         command->val[0] = v0;
1342         command->val[1] = v1;
1343         command->val[2] = v2;
1344         command->val[3] = v3;
1345
1346         dpsoftrast.uniform4f[index*4+0] = v0;
1347         dpsoftrast.uniform4f[index*4+1] = v1;
1348         dpsoftrast.uniform4f[index*4+2] = v2;
1349         dpsoftrast.uniform4f[index*4+3] = v3;
1350 }
1351 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1352 {
1353         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1354         command->index = index;
1355         memcpy(command->val, v, sizeof(command->val));
1356
1357         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1358 }
1359
1360 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1361 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1362 {
1363         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1364 }
1365 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1366 {
1367 #ifdef SSE2_PRESENT
1368         int i, index;
1369         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1370         {
1371                 __m128 m0, m1, m2, m3;
1372                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1373                 command->index = (DPSOFTRAST_UNIFORM)index;
1374                 if (((size_t)v)&(ALIGN_SIZE-1))
1375                 {
1376                         m0 = _mm_loadu_ps(v);
1377                         m1 = _mm_loadu_ps(v+4);
1378                         m2 = _mm_loadu_ps(v+8);
1379                         m3 = _mm_loadu_ps(v+12);
1380                 }
1381                 else
1382                 {
1383                         m0 = _mm_load_ps(v);
1384                         m1 = _mm_load_ps(v+4);
1385                         m2 = _mm_load_ps(v+8);
1386                         m3 = _mm_load_ps(v+12);
1387                 }
1388                 if (transpose)
1389                 {
1390                         __m128 t0, t1, t2, t3;
1391                         t0 = _mm_unpacklo_ps(m0, m1);
1392                         t1 = _mm_unpacklo_ps(m2, m3);
1393                         t2 = _mm_unpackhi_ps(m0, m1);
1394                         t3 = _mm_unpackhi_ps(m2, m3);
1395                         m0 = _mm_movelh_ps(t0, t1);
1396                         m1 = _mm_movehl_ps(t1, t0);
1397                         m2 = _mm_movelh_ps(t2, t3);
1398                         m3 = _mm_movehl_ps(t3, t2);                     
1399                 }
1400                 _mm_store_ps(command->val, m0);
1401                 _mm_store_ps(command->val+4, m1);
1402                 _mm_store_ps(command->val+8, m2);
1403                 _mm_store_ps(command->val+12, m3);
1404                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1405                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1406                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1407                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1408         }
1409 #endif
1410 }
1411
1412 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1413 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1414 {
1415         thread->uniform1i[command->index] = command->val;
1416 }
1417 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1418 {
1419         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1420         command->index = index;
1421         command->val = i0;
1422
1423         dpsoftrast.uniform1i[command->index] = i0;
1424 }
1425
1426 #ifdef SSE2_PRESENT
1427 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1428 {
1429         float *end = dst + size*4;
1430         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1431         {
1432                 while (dst < end)
1433                 {
1434                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1435                         dst += 4;
1436                         src += stride;
1437                 }
1438         }
1439         else
1440         {
1441                 while (dst < end)
1442                 {
1443                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1444                         dst += 4;
1445                         src += stride;
1446                 }
1447         }
1448 }
1449
1450 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1451 {
1452         float *end = dst + size*4;
1453         if (stride == sizeof(float[3]))
1454         {
1455                 float *end4 = dst + (size&~3)*4;        
1456                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1457                 {
1458                         while (dst < end4)
1459                         {
1460                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1461                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1462                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1463                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1464                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1465                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1466                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1467                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1468                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1469                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1470                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1471                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1472                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1473                                 dst += 16;
1474                                 src += 4*sizeof(float[3]);
1475                         }
1476                 }
1477                 else
1478                 {
1479                         while (dst < end4)
1480                         {
1481                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1482                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1483                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1484                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1485                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1486                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1487                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1488                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1489                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1490                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1491                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1492                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1493                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1494                                 dst += 16;
1495                                 src += 4*sizeof(float[3]);
1496                         }
1497                 }
1498         }
1499         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1500         {
1501                 while (dst < end)
1502                 {
1503                         __m128 v = _mm_loadu_ps((const float *)src);
1504                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1505                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1506                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1507                         _mm_store_ps(dst, v);
1508                         dst += 4;
1509                         src += stride;
1510                 }
1511         }
1512         else
1513         {
1514                 while (dst < end)
1515                 {
1516                         __m128 v = _mm_load_ps((const float *)src);
1517                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1518                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1519                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1520                         _mm_store_ps(dst, v);
1521                         dst += 4;
1522                         src += stride;
1523                 }
1524         }
1525 }
1526
1527 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1528 {
1529         float *end = dst + size*4;
1530         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1531         if (stride == sizeof(float[2]))
1532         {
1533                 float *end2 = dst + (size&~1)*4;
1534                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1535                 {
1536                         while (dst < end2)
1537                         {
1538                                 __m128 v = _mm_loadu_ps((const float *)src);
1539                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1540                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1541                                 dst += 8;
1542                                 src += 2*sizeof(float[2]);
1543                         }
1544                 }
1545                 else
1546                 {
1547                         while (dst < end2)
1548                         {
1549                                 __m128 v = _mm_load_ps((const float *)src);
1550                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1551                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1552                                 dst += 8;
1553                                 src += 2*sizeof(float[2]);
1554                         }
1555                 }
1556         }
1557         while (dst < end)
1558         {
1559                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1560                 dst += 4;
1561                 src += stride;
1562         }
1563 }
1564
1565 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1566 {
1567         float *end = dst + size*4;
1568         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1569         if (stride == sizeof(unsigned char[4]))
1570         {
1571                 float *end4 = dst + (size&~3)*4;
1572                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1573                 {
1574                         while (dst < end4)
1575                         {
1576                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1577                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1578                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1579                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1580                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1581                                 dst += 16;
1582                                 src += 4*sizeof(unsigned char[4]);
1583                         }
1584                 }
1585                 else
1586                 {
1587                         while (dst < end4)
1588                         {
1589                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1590                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1591                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1592                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1593                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1594                                 dst += 16;
1595                                 src += 4*sizeof(unsigned char[4]);
1596                         }
1597                 }
1598         }
1599         while (dst < end)
1600         {
1601                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1602                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1603                 dst += 4;
1604                 src += stride;
1605         }
1606 }
1607
// Writes the four floats at src into each of the `size` 4-float slots of dst.
//   dst  - destination, written with aligned 16-byte stores
//   src  - four floats to replicate (read once, no alignment requirement)
//   size - number of 4-float slots to fill
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int slot;
	for (slot = 0; slot < size; slot++)
		_mm_store_ps(dst + 4*slot, value);
}
1618 #endif
1619
// Transforms `numitems` 4-float vectors by a 16-float matrix:
//   out = x*M[0..3] + y*M[4..7] + z*M[8..11] + w*M[12..15]
//   out4f       - destination, written with aligned 16-byte stores; may be the
//                 same pointer as in4f
//   in4f        - source vectors (aligned or unaligned)
//   numitems    - number of vectors
//   inmatrix16f - the 16 matrix floats (read with unaligned loads)
// An identity matrix is detected by bytewise comparison and handled as a plain
// copy (or nothing at all when transforming in place).
// Without SSE2_PRESENT this function has an empty body.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 mx, my, mz, mw;
	int item;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	mx = _mm_loadu_ps(inmatrix16f);
	my = _mm_loadu_ps(inmatrix16f + 4);
	mz = _mm_loadu_ps(inmatrix16f + 8);
	mw = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1))
	{
		// unaligned input
		for (item = 0; item < numitems; item++, in4f += 4, out4f += 4)
		{
			const __m128 v = _mm_loadu_ps(in4f);
			__m128 sum = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx), sum);
			_mm_store_ps(out4f, sum);
		}
	}
	else
	{
		// aligned input
		for (item = 0; item < numitems; item++, in4f += 4, out4f += 4)
		{
			const __m128 v = _mm_load_ps(in4f);
			__m128 sum = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mz), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), my), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mx), sum);
			_mm_store_ps(out4f, sum);
		}
	}
#endif
}
1667
// Copies `numitems` 4-float vectors from in4f to out4f with memcpy
// (the two ranges must not overlap).
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1672
1673 #ifdef SSE2_PRESENT
// DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale)
// Perspective-divides the vector `in` = (x y z w) and applies the viewport:
//   1. the input is rotated to (w x y z) and lane 0 is replaced by 1.0, giving (1 x y z)
//   2. each lane becomes viewportcenter + viewportscale * lane / w
//   3. the result is rotated back, so `out` holds the x, y, z terms in lanes 0..2
//      and the 1/w term in lane 3
// Because the math is done on the rotated vector, lane 0 of viewportcenter and
// viewportscale pairs with the 1/w term and lanes 1..3 pair with x, y, z.
// The macro expands to a brace block with locals named `p` and `w`; `in` is
// captured before `out` is written, so the same variable may be passed for both.
1674 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1675 { \
1676         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1677         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1678         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1679         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1680 }
1681
// DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale)
// The expansion is statement-for-statement the same as DPSOFTRAST_PROJECTVERTEX:
// rotate `in` to (1 x y z), compute viewportcenter + viewportscale * lane / w per
// lane, and rotate the result back into `out`.
// NOTE(review): despite the name, nothing here is specific to Y - confirm
// whether this macro is still needed or was meant to differ.
1682 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1683 { \
1684         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1685         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1686         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1687         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1688 }
1689
// DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3)
// Computes out = x*m0 + (y*m1 + (z*m2 + w*m3)) for in = (x y z w), where each
// component of `in` is broadcast across a register before the multiply.
// The macro expands to a brace block with a local named `p`; `in` is copied
// into it first, so the same variable may be passed for `out` and `in`.
1690 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1691 { \
1692         __m128 p = (in); \
1693         out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1694                                                   _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1695                                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1696                                                                                         _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
1697 }
1698
// Computes a scanline range covering the bounding box [minpos, maxpos] after it
// is transformed by (m0, m1, m2, m3), and reports which box corners are clipped.
//   starty, endy   - outputs: *starty is the truncated mapped maximum projection,
//                    *endy is the truncated mapped minimum projection plus one
//   minpos, maxpos - componentwise minimum / maximum of the untransformed vertices
//   viewportcenter, viewportscale - viewport mapping in the lane layout used by
//                    DPSOFTRAST_PROJECTVERTEX (lane 2 is the one that pairs with y/w)
//   m0..m3         - the four matrix vectors as used by DPSOFTRAST_TRANSFORMVERTEX
// Returns an 8-bit mask with bit k set when corner k has z + w < 0 after the
// transform. Bits 0, 1 and 2 of a corner index select maxpos instead of minpos
// for x, y and z respectively.
// The BBFRONT and BBCLIP helper macros defined inside are not #undef'd afterwards.
1699 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1700 {
1701         int clipmask = 0xFF;
1702         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
             // swap lanes 0 and 1 of every matrix vector so the transformed Y ends up
             // in lane 0, where the scalar (_ss) operations below work on it
1703         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1704         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1705         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1706         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
             // BBFRONT(k, pos): transform corner k into bb[k] and put z + w into lane 0
             // of clipdist[k]. When that distance is >= 0 the corner's clip bit is
             // cleared and its y / w is folded into minproj / maxproj.
1707         #define BBFRONT(k, pos) \
1708         { \
1709                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1710                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1711                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1712                 { \
1713                         __m128 proj; \
1714                         clipmask &= ~(1<<k); \
1715                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1716                         minproj = _mm_min_ss(minproj, proj); \
1717                         maxproj = _mm_max_ss(maxproj, proj); \
1718                 } \
1719         }
             // the eight box corners, built by mixing lanes of minpos and maxpos
             // (the w lane comes from minpos for corners 0-3 and from maxpos for 4-7)
1720         BBFRONT(0, minpos); 
1721         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1722         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1723         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1724         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1725         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1726         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1727         BBFRONT(7, maxpos);
             // BBCLIP(k): for a corner that is still flagged as clipped, visit the
             // three corners that differ from it in exactly one axis (k^1, k^2, k^4).
             // For each such neighbour that is not clipped, interpolate along the edge
             // to the point where the clip distance reaches zero and fold that point's
             // y / w into minproj / maxproj.
1728         #define BBCLIP(k) \
1729         { \
1730                 if (clipmask&(1<<k)) \
1731                 { \
1732                         if (!(clipmask&(1<<(k^1)))) \
1733                         { \
1734                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1735                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1736                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1737                                 minproj = _mm_min_ss(minproj, proj); \
1738                                 maxproj = _mm_max_ss(maxproj, proj); \
1739                         } \
1740                         if (!(clipmask&(1<<(k^2)))) \
1741                         { \
1742                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1743                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1744                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1745                                 minproj = _mm_min_ss(minproj, proj); \
1746                                 maxproj = _mm_max_ss(maxproj, proj); \
1747                         } \
1748                         if (!(clipmask&(1<<(k^4)))) \
1749                         { \
1750                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1751                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1752                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1753                                 minproj = _mm_min_ss(minproj, proj); \
1754                                 maxproj = _mm_max_ss(maxproj, proj); \
1755                         } \
1756                 } \
1757         }
1758         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
             // move lane 2 of the viewport vectors into lane 0 for the scalar math below
1759         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1760         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
             // clamp the projected range to [-2, 2] before mapping it through the viewport
1761         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1762         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1763         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1764         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
             // NOTE(review): start comes from the max projection and end from the min,
             // which presumably relies on a negative Y viewport scale - confirm
1765         *starty = _mm_cvttss_si32(maxproj);
1766         *endy = _mm_cvttss_si32(minproj)+1;
1767         return clipmask;
1768 }
1769         
1770 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1771 {
1772         float *end = out4f + numitems*4;
1773         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1774         __m128 minpos, maxpos;
1775         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1776         {
1777                 minpos = maxpos = _mm_loadu_ps(in4f);
1778                 while (out4f < end)
1779                 {
1780                         __m128 v = _mm_loadu_ps(in4f);
1781                         minpos = _mm_min_ps(minpos, v);
1782                         maxpos = _mm_max_ps(maxpos, v);
1783                         _mm_store_ps(out4f, v);
1784                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1785                         _mm_store_ps(screen4f, v);
1786                         in4f += 4;
1787                         out4f += 4;
1788                         screen4f += 4;
1789                 }
1790         }
1791         else
1792         {
1793                 minpos = maxpos = _mm_load_ps(in4f);
1794                 while (out4f < end)
1795                 {
1796                         __m128 v = _mm_load_ps(in4f);
1797                         minpos = _mm_min_ps(minpos, v);
1798                         maxpos = _mm_max_ps(maxpos, v);
1799                         _mm_store_ps(out4f, v);
1800                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1801                         _mm_store_ps(screen4f, v);
1802                         in4f += 4;
1803                         out4f += 4;
1804                         screen4f += 4;
1805                 }
1806         }
1807         if (starty && endy) 
1808                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, 
1809                                         _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1810                                         _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1811                                         _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1812                                         _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
1813         return 0;
1814 }
1815
1816 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1817 {
1818         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1819         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1820         float *end;
1821         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1822                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1823         end = out4f + numitems*4;
1824         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1825         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1826         m0 = _mm_loadu_ps(inmatrix16f);
1827         m1 = _mm_loadu_ps(inmatrix16f + 4);
1828         m2 = _mm_loadu_ps(inmatrix16f + 8);
1829         m3 = _mm_loadu_ps(inmatrix16f + 12);
1830         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1831         {
1832                 minpos = maxpos = _mm_loadu_ps(in4f);
1833                 while (out4f < end)
1834                 {
1835                         __m128 v = _mm_loadu_ps(in4f);
1836                         minpos = _mm_min_ps(minpos, v);
1837                         maxpos = _mm_max_ps(maxpos, v);
1838                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1839                         _mm_store_ps(out4f, v);
1840                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1841                         _mm_store_ps(screen4f, v);
1842                         in4f += 4;
1843                         out4f += 4;
1844                         screen4f += 4;
1845                 }
1846         }
1847         else
1848         {
1849                 minpos = maxpos = _mm_load_ps(in4f);
1850                 while (out4f < end)
1851                 {
1852                         __m128 v = _mm_load_ps(in4f);
1853                         minpos = _mm_min_ps(minpos, v);
1854                         maxpos = _mm_max_ps(maxpos, v);
1855                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1856                         _mm_store_ps(out4f, v);
1857                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1858                         _mm_store_ps(screen4f, v);
1859                         in4f += 4;
1860                         out4f += 4;
1861                         screen4f += 4;
1862                 }
1863         }
1864         if (starty && endy) 
1865                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3); 
1866         return 0;
1867 }
1868 #endif
1869
// Fills dpsoftrast.post_array4f[outarray] with 4-float vertex data taken from
// the client array selected by `inarray`, covering dpsoftrast.numvertices
// vertices starting at dpsoftrast.firstvertex.
//   DPSOFTRAST_ARRAY_POSITION - 3-float positions, expanded by DPSOFTRAST_Load3fTo4f
//   DPSOFTRAST_ARRAY_COLOR    - float colors if set, else byte colors if set,
//                               else the constant dpsoftrast.color
//   anything else             - texcoord unit (inarray - DPSOFTRAST_ARRAY_TEXCOORD0)
//                               with 2, 3 or 4 float components; when that
//                               unit has no pointer, or another component
//                               count, the output array is left untouched
// Returns the output array, or NULL when built without SSE2_PRESENT.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float * const dest = dpsoftrast.post_array4f[outarray];
	const int first = dpsoftrast.firstvertex;
	const int count = dpsoftrast.numvertices;
	const unsigned char *bytes;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		bytes = (const unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
		DPSOFTRAST_Load3fTo4f(dest, bytes, count, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			bytes = (const unsigned char *)dpsoftrast.pointer_color4f + first * stride;
			DPSOFTRAST_Load4fTo4f(dest, bytes, count, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			bytes = (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride;
			DPSOFTRAST_Load4bTo4f(dest, bytes, count, stride);
		}
		else
			DPSOFTRAST_Fill4f(dest, dpsoftrast.color, count);
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			const int components = dpsoftrast.components_texcoord[unit];
			bytes = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
			if (components == 2)
				DPSOFTRAST_Load2fTo4f(dest, bytes, count, stride);
			else if (components == 3)
				DPSOFTRAST_Load3fTo4f(dest, bytes, count, stride);
			else if (components == 4)
				DPSOFTRAST_Load4fTo4f(dest, bytes, count, stride);
		}
	}
	return dest;
#else
	return NULL;
#endif
}
1928
1929 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1930 {
1931         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1932         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1933         return data;
1934 }
1935
1936 #if 0
// Projects a post array to screen space (no matrix transform) and returns it.
// With inarray >= 0 the array is first loaded through DPSOFTRAST_Array_Load;
// otherwise the current contents of dpsoftrast.post_array4f[outarray] are used.
// Screen coordinates go to dpsoftrast.screencoord4f, the Y range to
// dpsoftrast.drawstarty / drawendy, and the clip mask to dpsoftrast.drawclipped.
// Returns NULL when built without SSE2_PRESENT.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *verts;
	if (inarray >= 0)
		verts = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		verts = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(verts, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, verts, dpsoftrast.numvertices);
	return verts;
#else
	return NULL;
#endif
}
1947 #endif
1948
// Transforms a post array in place by inmatrix16f, projects it to screen space,
// and returns it.
// With inarray >= 0 the array is first loaded through DPSOFTRAST_Array_Load;
// otherwise the current contents of dpsoftrast.post_array4f[outarray] are used.
// Screen coordinates go to dpsoftrast.screencoord4f, the Y range to
// dpsoftrast.drawstarty / drawendy, and the clip mask to dpsoftrast.drawclipped.
// Returns NULL when built without SSE2_PRESENT.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	float *verts;
	if (inarray >= 0)
		verts = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		verts = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(verts, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, verts, dpsoftrast.numvertices, inmatrix16f);
	return verts;
#else
	return NULL;
#endif
}
1959
1960 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1961 {
1962         int x;
1963         int startx = span->startx;
1964         int endx = span->endx;
1965         float wslope = triangle->w[0];
1966         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1967         float endz = 1.0f / (w + wslope * startx);
1968         for (x = startx;x < endx;)
1969         {
1970                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1971                 float z = endz, dz;
1972                 if (nextsub >= endx) nextsub = endsub = endx-1;
1973                 endz = 1.0f / (w + wslope * nextsub);
1974                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1975                 for (; x <= endsub; x++, z += dz)
1976                         zf[x] = z;
1977         }
1978 }
1979
1980 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1981 {
1982         int x;
1983         int startx = span->startx;
1984         int endx = span->endx;
1985         int d[4];
1986         float a, b;
1987         unsigned char * RESTRICT pixelmask = span->pixelmask;
1988         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1989         if (!pixel)
1990                 return;
1991         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1992         // handle alphatest now (this affects depth writes too)
1993         if (thread->alphatest)
1994                 for (x = startx;x < endx;x++)
1995                         if (in4f[x*4+3] < 0.5f)
1996                                 pixelmask[x] = false;
1997         // FIXME: this does not handle bigendian
1998         switch(thread->fb_blendmode)
1999         {
2000         case DPSOFTRAST_BLENDMODE_OPAQUE:
2001                 for (x = startx;x < endx;x++)
2002                 {
2003                         if (!pixelmask[x])
2004                                 continue;
2005                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2006                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2007                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2008                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2009                         pixel[x*4+0] = d[0];
2010                         pixel[x*4+1] = d[1];
2011                         pixel[x*4+2] = d[2];
2012                         pixel[x*4+3] = d[3];
2013                 }
2014                 break;
2015         case DPSOFTRAST_BLENDMODE_ALPHA:
2016                 for (x = startx;x < endx;x++)
2017                 {
2018                         if (!pixelmask[x])
2019                                 continue;
2020                         a = in4f[x*4+3] * 255.0f;
2021                         b = 1.0f - in4f[x*4+3];
2022                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2023                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2024                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2025                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2026                         pixel[x*4+0] = d[0];
2027                         pixel[x*4+1] = d[1];
2028                         pixel[x*4+2] = d[2];
2029                         pixel[x*4+3] = d[3];
2030                 }
2031                 break;
2032         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2033                 for (x = startx;x < endx;x++)
2034                 {
2035                         if (!pixelmask[x])
2036                                 continue;
2037                         a = in4f[x*4+3] * 255.0f;
2038                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2039                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2040                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2041                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2042                         pixel[x*4+0] = d[0];
2043                         pixel[x*4+1] = d[1];
2044                         pixel[x*4+2] = d[2];
2045                         pixel[x*4+3] = d[3];
2046                 }
2047                 break;
2048         case DPSOFTRAST_BLENDMODE_ADD:
2049                 for (x = startx;x < endx;x++)
2050                 {
2051                         if (!pixelmask[x])
2052                                 continue;
2053                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2054                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2055                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2056                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2057                         pixel[x*4+0] = d[0];
2058                         pixel[x*4+1] = d[1];
2059                         pixel[x*4+2] = d[2];
2060                         pixel[x*4+3] = d[3];
2061                 }
2062                 break;
2063         case DPSOFTRAST_BLENDMODE_INVMOD:
2064                 for (x = startx;x < endx;x++)
2065                 {
2066                         if (!pixelmask[x])
2067                                 continue;
2068                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2069                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2070                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2071                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2072                         pixel[x*4+0] = d[0];
2073                         pixel[x*4+1] = d[1];
2074                         pixel[x*4+2] = d[2];
2075                         pixel[x*4+3] = d[3];
2076                 }
2077                 break;
2078         case DPSOFTRAST_BLENDMODE_MUL:
2079                 for (x = startx;x < endx;x++)
2080                 {
2081                         if (!pixelmask[x])
2082                                 continue;
2083                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2084                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2085                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2086                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2087                         pixel[x*4+0] = d[0];
2088                         pixel[x*4+1] = d[1];
2089                         pixel[x*4+2] = d[2];
2090                         pixel[x*4+3] = d[3];
2091                 }
2092                 break;
2093         case DPSOFTRAST_BLENDMODE_MUL2:
2094                 for (x = startx;x < endx;x++)
2095                 {
2096                         if (!pixelmask[x])
2097                                 continue;
2098                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2099                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2100                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2101                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2102                         pixel[x*4+0] = d[0];
2103                         pixel[x*4+1] = d[1];
2104                         pixel[x*4+2] = d[2];
2105                         pixel[x*4+3] = d[3];
2106                 }
2107                 break;
2108         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2109                 for (x = startx;x < endx;x++)
2110                 {
2111                         if (!pixelmask[x])
2112                                 continue;
2113                         a = in4f[x*4+3] * -255.0f;
2114                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2115                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2116                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2117                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2118                         pixel[x*4+0] = d[0];
2119                         pixel[x*4+1] = d[1];
2120                         pixel[x*4+2] = d[2];
2121                         pixel[x*4+3] = d[3];
2122                 }
2123                 break;
2124         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2125                 for (x = startx;x < endx;x++)
2126                 {
2127                         if (!pixelmask[x])
2128                                 continue;
2129                         a = 255.0f;
2130                         b = 1.0f - in4f[x*4+3];
2131                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2132                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2133                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2134                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2135                         pixel[x*4+0] = d[0];
2136                         pixel[x*4+1] = d[1];
2137                         pixel[x*4+2] = d[2];
2138                         pixel[x*4+3] = d[3];
2139                 }
2140                 break;
2141         case DPSOFTRAST_BLENDMODE_INVADD:
2142                 for (x = startx;x < endx;x++)
2143                 {
2144                         if (!pixelmask[x])
2145                                 continue;
2146                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2147                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2148                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2149                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2150                         pixel[x*4+0] = d[0];
2151                         pixel[x*4+1] = d[1];
2152                         pixel[x*4+2] = d[2];
2153                         pixel[x*4+3] = d[3];
2154                 }
2155                 break;
2156         }
2157 }
2158
2159 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2160 {
2161 #ifdef SSE2_PRESENT
2162         int x;
2163         int startx = span->startx;
2164         int endx = span->endx;
2165         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2166         unsigned char * RESTRICT pixelmask = span->pixelmask;
2167         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2168         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2169         if (!pixel)
2170                 return;
2171         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2172         pixeli += span->y * dpsoftrast.fb_width + span->x;
2173         // handle alphatest now (this affects depth writes too)
2174         if (thread->alphatest)
2175                 for (x = startx;x < endx;x++)
2176                         if (in4ub[x*4+3] < 0.5f)
2177                                 pixelmask[x] = false;
2178         // FIXME: this does not handle bigendian
2179         switch(thread->fb_blendmode)
2180         {
2181         case DPSOFTRAST_BLENDMODE_OPAQUE:
2182                 for (x = startx;x + 4 <= endx;)
2183                 {
2184                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2185                         {
2186                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2187                                 x += 4;
2188                         }
2189                         else
2190                         {
2191                                 if (pixelmask[x])
2192                                         pixeli[x] = ini[x];
2193                                 x++;
2194                         }
2195                 }
2196                 for (;x < endx;x++)
2197                         if (pixelmask[x])
2198                                 pixeli[x] = ini[x];
2199                 break;
2200         case DPSOFTRAST_BLENDMODE_ALPHA:
2201         #define FINISHBLEND(blend2, blend1) \
2202                 for (x = startx;x + 1 < endx;x += 2) \
2203                 { \
2204                         __m128i src, dst; \
2205                         switch (*(const unsigned short*)&pixelmask[x]) \
2206                         { \
2207                         case 0x0101: \
2208                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2209                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2210                                 blend2; \
2211                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2212                                 continue; \
2213                         case 0x0100: \
2214                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2215                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2216                                 blend1; \
2217                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2218                                 continue; \
2219                         case 0x0001: \
2220                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2221                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2222                                 blend1; \
2223                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2224                                 continue; \
2225                         } \
2226                         break; \
2227                 } \
2228                 for(;x < endx; x++) \
2229                 { \
2230                         __m128i src, dst; \
2231                         if (!pixelmask[x]) \
2232                                 continue; \
2233                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2234                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2235                         blend1; \
2236                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2237                 }
2238
2239                 FINISHBLEND({
2240                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2241                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2242                 }, {
2243                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2244                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2245                 });
2246                 break;
2247         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2248                 FINISHBLEND({
2249                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2250                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2251                 }, {
2252                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2253                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2254                 });
2255                 break;
2256         case DPSOFTRAST_BLENDMODE_ADD:
2257                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2258                 break;
2259         case DPSOFTRAST_BLENDMODE_INVMOD:
2260                 FINISHBLEND({
2261                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2262                 }, {
2263                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2264                 });
2265                 break;
2266         case DPSOFTRAST_BLENDMODE_MUL:
2267                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2268                 break;
2269         case DPSOFTRAST_BLENDMODE_MUL2:
2270                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2271                 break;
2272         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2273                 FINISHBLEND({
2274                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2275                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2276                 }, {
2277                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2278                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2279                 });
2280                 break;
2281         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2282                 FINISHBLEND({
2283                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2284                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2285                 }, {
2286                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2287                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2288                 });
2289                 break;
2290         case DPSOFTRAST_BLENDMODE_INVADD:
2291                 FINISHBLEND({
2292                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2293                 }, {
2294                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2295                 });
2296                 break;
2297         }
2298 #endif
2299 }
2300
2301 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2302 {
2303         int x;
2304         int startx = span->startx;
2305         int endx = span->endx;
2306         int flags;
2307         float c[4];
2308         float data[4];
2309         float slope[4];
2310         float tc[2], endtc[2];
2311         float tcscale[2];
2312         unsigned int tci[2];
2313         unsigned int tci1[2];
2314         unsigned int tcimin[2];
2315         unsigned int tcimax[2];
2316         int tciwrapmask[2];
2317         int tciwidth;
2318         int filter;
2319         int mip;
2320         const unsigned char * RESTRICT pixelbase;
2321         const unsigned char * RESTRICT pixel[4];
2322         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2323         // if no texture is bound, just fill it with white
2324         if (!texture)
2325         {
2326                 for (x = startx;x < endx;x++)
2327                 {
2328                         out4f[x*4+0] = 1.0f;
2329                         out4f[x*4+1] = 1.0f;
2330                         out4f[x*4+2] = 1.0f;
2331                         out4f[x*4+3] = 1.0f;
2332                 }
2333                 return;
2334         }
2335         mip = triangle->mip[texunitindex];
2336         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2337         // if this mipmap of the texture is 1 pixel, just fill it with that color
2338         if (texture->mipmap[mip][1] == 4)
2339         {
2340                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2341                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2342                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2343                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2344                 for (x = startx;x < endx;x++)
2345                 {
2346                         out4f[x*4+0] = c[0];
2347                         out4f[x*4+1] = c[1];
2348                         out4f[x*4+2] = c[2];
2349                         out4f[x*4+3] = c[3];
2350                 }
2351                 return;
2352         }
2353         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2354         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2355         flags = texture->flags;
2356         tcscale[0] = texture->mipmap[mip][2];
2357         tcscale[1] = texture->mipmap[mip][3];
2358         tciwidth = texture->mipmap[mip][2];
2359         tcimin[0] = 0;
2360         tcimin[1] = 0;
2361         tcimax[0] = texture->mipmap[mip][2]-1;
2362         tcimax[1] = texture->mipmap[mip][3]-1;
2363         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2364         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2365         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2366         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2367         for (x = startx;x < endx;)
2368         {
2369                 unsigned int subtc[2];
2370                 unsigned int substep[2];
2371                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2372                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2373                 if (nextsub >= endx)
2374                 {
2375                         nextsub = endsub = endx-1;      
2376                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2377                 }
2378                 tc[0] = endtc[0];
2379                 tc[1] = endtc[1];
2380                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2381                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2382                 substep[0] = (endtc[0] - tc[0]) * subscale;
2383                 substep[1] = (endtc[1] - tc[1]) * subscale;
2384                 subtc[0] = tc[0] * (1<<16);
2385                 subtc[1] = tc[1] * (1<<16);
2386                 if (filter)
2387                 {
2388                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2389                         {
2390                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2391                                 {
2392                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2393                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2394                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2395                                         tci[0] = subtc[0]>>16;
2396                                         tci[1] = subtc[1]>>16;
2397                                         tci1[0] = tci[0] + 1;
2398                                         tci1[1] = tci[1] + 1;
2399                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2400                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2401                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2402                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2403                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2404                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2405                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2406                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2407                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2408                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2409                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2410                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2411                                         out4f[x*4+0] = c[0];
2412                                         out4f[x*4+1] = c[1];
2413                                         out4f[x*4+2] = c[2];
2414                                         out4f[x*4+3] = c[3];
2415                                 }
2416                         }
2417                         else
2418                         {
2419                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2420                                 {
2421                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2422                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2423                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2424                                         tci[0] = subtc[0]>>16;
2425                                         tci[1] = subtc[1]>>16;
2426                                         tci1[0] = tci[0] + 1;
2427                                         tci1[1] = tci[1] + 1;
2428                                         tci[0] &= tciwrapmask[0];
2429                                         tci[1] &= tciwrapmask[1];
2430                                         tci1[0] &= tciwrapmask[0];
2431                                         tci1[1] &= tciwrapmask[1];
2432                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2433                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2434                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2435                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2436                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2437                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2438                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2439                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2440                                         out4f[x*4+0] = c[0];
2441                                         out4f[x*4+1] = c[1];
2442                                         out4f[x*4+2] = c[2];
2443                                         out4f[x*4+3] = c[3];
2444                                 }
2445                         }
2446                 }
2447                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2448                 {
2449                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2450                         {
2451                                 tci[0] = subtc[0]>>16;
2452                                 tci[1] = subtc[1]>>16;
2453                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2454                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2455                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2456                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2457                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2458                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2459                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2460                                 out4f[x*4+0] = c[0];
2461                                 out4f[x*4+1] = c[1];
2462                                 out4f[x*4+2] = c[2];
2463                                 out4f[x*4+3] = c[3];
2464                         }
2465                 }
2466                 else
2467                 {
2468                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2469                         {
2470                                 tci[0] = subtc[0]>>16;
2471                                 tci[1] = subtc[1]>>16;
2472                                 tci[0] &= tciwrapmask[0];
2473                                 tci[1] &= tciwrapmask[1];
2474                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2475                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2476                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2477                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2478                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2479                                 out4f[x*4+0] = c[0];
2480                                 out4f[x*4+1] = c[1];
2481                                 out4f[x*4+2] = c[2];
2482                                 out4f[x*4+3] = c[3];
2483                         }
2484                 }
2485         }
2486 }
2487
2488 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2489 {
2490 #ifdef SSE2_PRESENT
2491         int x;
2492         int startx = span->startx;
2493         int endx = span->endx;
2494         int flags;
2495         __m128 data, slope, tcscale;
2496         __m128i tcsize, tcmask, tcoffset, tcmax;
2497         __m128 tc, endtc;
2498         __m128i subtc, substep, endsubtc;
2499         int filter;
2500         int mip;
2501         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2502         const unsigned char * RESTRICT pixelbase;
2503         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2504         // if no texture is bound, just fill it with white
2505         if (!texture)
2506         {
2507                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2508                 return;
2509         }
2510         mip = triangle->mip[texunitindex];
2511         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2512         // if this mipmap of the texture is 1 pixel, just fill it with that color
2513         if (texture->mipmap[mip][1] == 4)
2514         {
2515                 unsigned int k = *((const unsigned int *)pixelbase);
2516                 for (x = startx;x < endx;x++)
2517                         outi[x] = k;
2518                 return;
2519         }
2520         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2521         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2522         flags = texture->flags;
2523         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2524         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2525         tcscale = _mm_cvtepi32_ps(tcsize);
2526         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2527         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2528         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2529         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2530         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2531         tcmax = _mm_packs_epi32(tcmask, tcmask);
2532         for (x = startx;x < endx;)
2533         {
2534                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2535                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2536                 if (nextsub >= endx)
2537                 {
2538                         nextsub = endsub = endx-1;
2539                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2540                 }       
2541                 tc = endtc;
2542                 subtc = endsubtc;
2543                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2544                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2545                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2546                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2547                 substep = _mm_slli_epi32(substep, 1);
2548                 if (filter)
2549                 {
2550                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2551                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2552                         {
2553                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2554                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2555                                 {
2556                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2557                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2558                                         tci = _mm_madd_epi16(tci, tcoffset);
2559                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2560                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2561                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2562                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2563                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2564                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2565                                         fracm = _mm_srli_epi16(subtc, 1);
2566                                         pix1 = _mm_add_epi16(pix1,
2567                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2568                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2569                                         pix3 = _mm_add_epi16(pix3,
2570                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2571                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2572                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2573                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2574                                         pix2 = _mm_add_epi16(pix2,
2575                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2576                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2577                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2578                                 }
2579                                 if (x <= endsub)
2580                                 {
2581                                         const unsigned char * RESTRICT ptr1;
2582                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2583                                         tci = _mm_madd_epi16(tci, tcoffset);
2584                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2585                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2586                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2587                                         fracm = _mm_srli_epi16(subtc, 1);
2588                                         pix1 = _mm_add_epi16(pix1,
2589                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2590                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2591                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2592                                         pix1 = _mm_add_epi16(pix1,
2593                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2594                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2595                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2596                                         x++;
2597                                 }
2598                         }
2599                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2600                         {
2601                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2602                                 {
2603                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2604                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2605                                         tci = _mm_madd_epi16(tci, tcoffset);
2606                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2607                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2608                                                                                         _mm_setzero_si128());
2609                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2610                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2611                                                                                         _mm_setzero_si128());
2612                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2613                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2614                                         tci = _mm_madd_epi16(tci, tcoffset);
2615                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2616                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2617                                                                                         _mm_setzero_si128());
2618                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2619                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2620                                                                                         _mm_setzero_si128());
2621                                         fracm = _mm_srli_epi16(subtc, 1);
2622                                         pix1 = _mm_add_epi16(pix1,
2623                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2624                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2625                                         pix3 = _mm_add_epi16(pix3,
2626                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2627                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2628                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2629                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2630                                         pix2 = _mm_add_epi16(pix2,
2631                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2632                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2633                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2634                                 }
2635                                 if (x <= endsub)
2636                                 {
2637                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2638                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2639                                         tci = _mm_madd_epi16(tci, tcoffset);
2640                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2641                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2642                                                                                         _mm_setzero_si128());
2643                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2644                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2645                                                                                         _mm_setzero_si128());
2646                                         fracm = _mm_srli_epi16(subtc, 1);
2647                                         pix1 = _mm_add_epi16(pix1,
2648                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2649                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2650                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2651                                         pix1 = _mm_add_epi16(pix1,
2652                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2653                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2654                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2655                                         x++;
2656                                 }
2657                         }
2658                         else
2659                         {
2660                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2661                                 {
2662                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2663                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2664                                         tci = _mm_madd_epi16(tci, tcoffset);
2665                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2666                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2667                                                                                         _mm_setzero_si128());
2668                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2669                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2670                                                                                         _mm_setzero_si128());
2671                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2672                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2673                                         tci = _mm_madd_epi16(tci, tcoffset);
2674                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2675                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2676                                                                                         _mm_setzero_si128());
2677                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2678                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2679                                                                                         _mm_setzero_si128());
2680                                         fracm = _mm_srli_epi16(subtc, 1);
2681                                         pix1 = _mm_add_epi16(pix1,
2682                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2683                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2684                                         pix3 = _mm_add_epi16(pix3,
2685                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2686                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2687                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2688                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2689                                         pix2 = _mm_add_epi16(pix2,
2690                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2691                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2692                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2693                                 }
2694                                 if (x <= endsub)
2695                                 {
2696                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2697                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2698                                         tci = _mm_madd_epi16(tci, tcoffset);
2699                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2700                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2701                                                                                         _mm_setzero_si128());
2702                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2703                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2704                                                                                         _mm_setzero_si128());
2705                                         fracm = _mm_srli_epi16(subtc, 1);
2706                                         pix1 = _mm_add_epi16(pix1,
2707                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2708                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2709                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2710                                         pix1 = _mm_add_epi16(pix1,
2711                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2712                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2713                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2714                                         x++;
2715                                 }
2716                         }
2717                 }
2718                 else
2719                 {
2720                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2721                         {
2722                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2723                                 {
2724                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2725                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2726                                         tci = _mm_madd_epi16(tci, tcoffset);
2727                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2728                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2729                                 }
2730                                 if (x <= endsub)
2731                                 {
2732                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2733                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2734                                         tci = _mm_madd_epi16(tci, tcoffset);
2735                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2736                                         x++;
2737                                 }
2738                         }
2739                         else
2740                         {
2741                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2742                                 {
2743                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2744                                         tci = _mm_and_si128(tci, tcmax); 
2745                                         tci = _mm_madd_epi16(tci, tcoffset);
2746                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2747                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2748                                 }
2749                                 if (x <= endsub)
2750                                 {
2751                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2752                                         tci = _mm_and_si128(tci, tcmax); 
2753                                         tci = _mm_madd_epi16(tci, tcoffset);
2754                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2755                                         x++;
2756                                 }
2757                         }
2758                 }
2759         }
2760 #endif
2761 }
2762
2763 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2764 {
2765         // TODO: IMPLEMENT
2766         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2767 }
2768
// Shadowmap sampling stub: the lookup vector is ignored and the result is
// always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float result = 1.0f;
        (void)vector;
        return result;
}
2774
2775 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2776 {
2777         int x;
2778         int startx = span->startx;
2779         int endx = span->endx;
2780         float c[4];
2781         float data[4];
2782         float slope[4];
2783         float z;
2784         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2785         for (x = startx;x < endx;x++)
2786         {
2787                 z = zf[x];
2788                 c[0] = (data[0] + slope[0]*x) * z;
2789                 c[1] = (data[1] + slope[1]*x) * z;
2790                 c[2] = (data[2] + slope[2]*x) * z;
2791                 c[3] = (data[3] + slope[3]*x) * z;
2792                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2793                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2794                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2795                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2796         }
2797 }
2798
2799 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2800 {
2801         int x;
2802         int startx = span->startx;
2803         int endx = span->endx;
2804         float c[4];
2805         float data[4];
2806         float slope[4];
2807         float z;
2808         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2809         for (x = startx;x < endx;x++)
2810         {
2811                 z = zf[x];
2812                 c[0] = (data[0] + slope[0]*x) * z;
2813                 c[1] = (data[1] + slope[1]*x) * z;
2814                 c[2] = (data[2] + slope[2]*x) * z;
2815                 c[3] = (data[3] + slope[3]*x) * z;
2816                 out4f[x*4+0] = c[0];
2817                 out4f[x*4+1] = c[1];
2818                 out4f[x*4+2] = c[2];
2819                 out4f[x*4+3] = c[3];
2820         }
2821 }
2822
2823 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2824 {
2825         int x, startx = span->startx, endx = span->endx;
2826         float c[4], localcolor[4];
2827         localcolor[0] = subcolor[0];
2828         localcolor[1] = subcolor[1];
2829         localcolor[2] = subcolor[2];
2830         localcolor[3] = subcolor[3];
2831         for (x = startx;x < endx;x++)
2832         {
2833                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2834                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2835                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2836                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2837                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2838                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2839                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2840                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2841         }
2842 }
2843
2844 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2845 {
2846         int x, startx = span->startx, endx = span->endx;
2847         for (x = startx;x < endx;x++)
2848         {
2849                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2850                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2851                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2852                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2853         }
2854 }
2855
2856 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2857 {
2858         int x, startx = span->startx, endx = span->endx;
2859         for (x = startx;x < endx;x++)
2860         {
2861                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2862                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2863                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2864                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2865         }
2866 }
2867
2868 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2869 {
2870         int x, startx = span->startx, endx = span->endx;
2871         float a, b;
2872         for (x = startx;x < endx;x++)
2873         {
2874                 a = 1.0f - inb4f[x*4+3];
2875                 b = inb4f[x*4+3];
2876                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2877                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2878                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2879                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2880         }
2881 }
2882
2883 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2884 {
2885         int x, startx = span->startx, endx = span->endx;
2886         float localcolor[4], ilerp, lerp;
2887         localcolor[0] = color[0];
2888         localcolor[1] = color[1];
2889         localcolor[2] = color[2];
2890         localcolor[3] = color[3];
2891         ilerp = 1.0f - localcolor[3];
2892         lerp = localcolor[3];
2893         for (x = startx;x < endx;x++)
2894         {
2895                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2896                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2897                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2898                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2899         }
2900 }
2901
2902
2903
2904 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2905 {
2906 #ifdef SSE2_PRESENT
2907         int x;
2908         int startx = span->startx;
2909         int endx = span->endx;
2910         __m128 data, slope;
2911         __m128 mod, endmod;
2912         __m128i submod, substep, endsubmod;
2913         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2914         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2915         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2916         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2917         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2918         for (x = startx; x < endx;)
2919         {
2920                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2921                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2922                 if (nextsub >= endx)
2923                 {
2924                         nextsub = endsub = endx-1;
2925                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2926                 }
2927                 mod = endmod;
2928                 submod = endsubmod;
2929                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2930                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2931                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2932                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2933                 substep = _mm_packs_epi32(substep, substep);
2934                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2935                 {
2936                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2937                         pix = _mm_mulhi_epu16(pix, submod);
2938                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2939                 }
2940                 if (x <= endsub)
2941                 {
2942                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2943                         pix = _mm_mulhi_epu16(pix, submod);
2944                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2945                         x++;
2946                 }
2947         }
2948 #endif
2949 }
2950
2951 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2952 {
2953 #ifdef SSE2_PRESENT
2954         int x;
2955         int startx = span->startx;
2956         int endx = span->endx;
2957         __m128 data, slope;
2958         __m128 mod, endmod;
2959         __m128i submod, substep, endsubmod;
2960         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2961         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2962         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2963         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2964         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2965         for (x = startx; x < endx;)
2966         {
2967                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2968                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2969                 if (nextsub >= endx)
2970                 {
2971                         nextsub = endsub = endx-1;
2972                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2973                 }
2974                 mod = endmod;
2975                 submod = endsubmod;
2976                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2977                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2978                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2979                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2980                 substep = _mm_packs_epi32(substep, substep);
2981                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2982                 {
2983                         __m128i pix = _mm_srai_epi16(submod, 4);
2984                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2985                 }
2986                 if (x <= endsub)
2987                 {
2988                         __m128i pix = _mm_srai_epi16(submod, 4);
2989                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2990                         x++;
2991                 }
2992         }
2993 #endif
2994 }
2995
2996 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2997 {
2998 #ifdef SSE2_PRESENT
2999         int x, startx = span->startx, endx = span->endx;
3000         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3001         localcolor = _mm_packs_epi32(localcolor, localcolor);
3002         for (x = startx;x+2 <= endx;x+=2)
3003         {
3004                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3005                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3006                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3007                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3008         }
3009         if (x < endx)
3010         {
3011                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3012                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3013                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3014                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3015         }
3016 #endif
3017 }
3018
3019 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3020 {
3021 #ifdef SSE2_PRESENT
3022         int x, startx = span->startx, endx = span->endx;
3023         for (x = startx;x+2 <= endx;x+=2)
3024         {
3025                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3026                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3027                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3028                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3029         }
3030         if (x < endx)
3031         {
3032                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3033                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3034                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3035                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3036         }
3037 #endif
3038 }
3039
3040 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3041 {
3042 #ifdef SSE2_PRESENT
3043         int x, startx = span->startx, endx = span->endx;
3044         for (x = startx;x+2 <= endx;x+=2)
3045         {
3046                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3047                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3048                 pix1 = _mm_add_epi16(pix1, pix2);
3049                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3050         }
3051         if (x < endx)
3052         {
3053                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3054                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3055                 pix1 = _mm_add_epi16(pix1, pix2);
3056                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3057         }
3058 #endif
3059 }
3060
3061 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3062 {
3063 #ifdef SSE2_PRESENT
3064         int x, startx = span->startx, endx = span->endx;
3065         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3066         tint = _mm_packs_epi32(tint, tint);
3067         for (x = startx;x+2 <= endx;x+=2)
3068         {
3069                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3070                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3071                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3072                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3073         }
3074         if (x < endx)
3075         {
3076                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3077                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3078                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3079                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3080         }
3081 #endif
3082 }
3083
3084 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3085 {
3086 #ifdef SSE2_PRESENT
3087         int x, startx = span->startx, endx = span->endx;
3088         for (x = startx;x+2 <= endx;x+=2)
3089         {
3090                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3091                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3092                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3093                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3094                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3095         }
3096         if (x < endx)
3097         {
3098                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3099                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3100                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3101                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3102                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3103         }
3104 #endif
3105 }
3106
3107 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3108 {
3109 #ifdef SSE2_PRESENT
3110         int x, startx = span->startx, endx = span->endx;
3111         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3112         localcolor = _mm_packs_epi32(localcolor, localcolor);
3113         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3114         for (x = startx;x+2 <= endx;x+=2)
3115         {
3116                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3117                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3118                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3119         }
3120         if (x < endx)
3121         {
3122                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3123                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3124                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3125         }
3126 #endif
3127 }
3128
3129
3130
3131 void DPSOFTRAST_VertexShader_Generic(void)
3132 {
3133         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3134         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3135         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3136         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3137                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3138 }
3139
3140 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3141 {
3142         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3143         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3144         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3145         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3146         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3147         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3148         {
3149                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3150                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3151                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3152                 {
3153                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3154                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3155                         {
3156                                 // multiply
3157                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3158                         }
3159                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3160                         {
3161                                 // add
3162                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3163                         }
3164                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3165                         {
3166                                 // alphablend
3167                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3168                         }
3169                 }
3170         }
3171         else
3172                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3173         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3174 }
3175
3176
3177
3178 void DPSOFTRAST_VertexShader_PostProcess(void)
3179 {
3180         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3181         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3182         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3183 }
3184
3185 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3186 {
3187         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3188         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3189         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3190         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3191         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3192         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3193         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3194         {
3195                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3196                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3197         }
3198         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3199         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3200         {
3201                 // TODO: implement saturation
3202         }
3203         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3204         {
3205                 // TODO: implement gammaramps
3206         }
3207         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3208 }
3209
3210
3211
3212 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3213 {
3214         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3215 }
3216
3217 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3218 {
3219         // this is never called (because colormask is off when this shader is used)
3220         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3221         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3222         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3223         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3224         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3225 }
3226
3227
3228
3229 void DPSOFTRAST_VertexShader_FlatColor(void)
3230 {
3231         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3232         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3233 }
3234
3235 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3236 {
3237 #ifdef SSE2_PRESENT
3238         unsigned char * RESTRICT pixelmask = span->pixelmask;
3239         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3240         int x, startx = span->startx, endx = span->endx;
3241         __m128i Color_Ambientm;
3242         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3243         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3244         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3245         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3246         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3247         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3248                 pixel = buffer_FragColorbgra8;
3249         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3250         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3251         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3252         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3253         for (x = startx;x < endx;x++)
3254         {
3255                 __m128i color, pix;
3256                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3257                 {
3258                         __m128i pix2;
3259                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3260                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3261                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3262                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3263                         x += 3;
3264                         continue;
3265                 }
3266                 if (!pixelmask[x])
3267                         continue;
3268                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3269                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3270                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3271         }
3272         if (pixel == buffer_FragColorbgra8)
3273                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3274 #endif
3275 }
3276
3277
3278
3279 void DPSOFTRAST_VertexShader_VertexColor(void)
3280 {
3281         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3282         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3283         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3284 }
3285
3286 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3287 {
3288 #ifdef SSE2_PRESENT
3289         unsigned char * RESTRICT pixelmask = span->pixelmask;
3290         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3291         int x, startx = span->startx, endx = span->endx;
3292         __m128i Color_Ambientm, Color_Diffusem;
3293         __m128 data, slope;
3294         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3295         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3296         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3297         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3298         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3299         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3300         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3301                 pixel = buffer_FragColorbgra8;
3302         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3303         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3304         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3305         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3306         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3307         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3308         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3309         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3310         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3311         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3312         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3313         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3314         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3315         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3316         {
3317                 __m128i color, mod, pix;
3318                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3319                 {
3320                         __m128i pix2, mod2;
3321                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3322                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3323                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3324                         data = _mm_add_ps(data, slope);
3325                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3326                         data = _mm_add_ps(data, slope);
3327                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3328                         data = _mm_add_ps(data, slope);
3329                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3330                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3331                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3332                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3333                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3334                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3335                         x += 3;
3336                         continue;
3337                 }
3338                 if (!pixelmask[x])
3339                         continue;
3340                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3341                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3342                 mod = _mm_packs_epi32(mod, mod);
3343                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3344                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3345         }
3346         if (pixel == buffer_FragColorbgra8)
3347                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3348 #endif
3349 }
3350
3351
3352
3353 void DPSOFTRAST_VertexShader_Lightmap(void)
3354 {
3355         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3356         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3357         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3358 }
3359
3360 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3361 {
3362 #ifdef SSE2_PRESENT
3363         unsigned char * RESTRICT pixelmask = span->pixelmask;
3364         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3365         int x, startx = span->startx, endx = span->endx;
3366         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3367         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3368         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3369         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3370         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3371         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3372         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3373         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3374         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3375         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3376                 pixel = buffer_FragColorbgra8;
3377         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3378         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3379         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3380         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3381         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3382         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3383         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3384         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3385         {
3386                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3387                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3388                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3389                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3390                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3391                 for (x = startx;x < endx;x++)
3392                 {
3393                         __m128i color, lightmap, glow, pix;
3394                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3395                         {
3396                                 __m128i pix2;
3397                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3398                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3399                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3400                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3401                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3402                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3403                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3404                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3405                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3406                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3407                                 x += 3;
3408                                 continue;
3409                         }
3410                         if (!pixelmask[x])
3411                                 continue;
3412                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3413                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3414                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3415                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3416                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3417                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3418                 }
3419         }
3420         else
3421         {
3422                 for (x = startx;x < endx;x++)
3423                 {
3424                         __m128i color, lightmap, pix;
3425                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3426                         {
3427                                 __m128i pix2;
3428                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3429                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3430                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3431                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3432                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3433                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3434                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3435                                 x += 3;
3436                                 continue;
3437                         }
3438                         if (!pixelmask[x]) 
3439                                 continue;
3440                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3441                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3442                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3443                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3444                 }
3445         }
3446         if (pixel == buffer_FragColorbgra8)
3447                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3448 #endif
3449 }
3450
3451
// Forward declarations: the wrapper shaders that follow call these two functions
// before their definitions appear further down in this file.
3452 void DPSOFTRAST_VertexShader_LightDirection(void);
3453 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3454
// FakeLight vertex shader: has no vertex work of its own, it simply forwards to
// the shared DPSOFTRAST_VertexShader_LightDirection().
3455 void DPSOFTRAST_VertexShader_FakeLight(void)
3456 {
3457         DPSOFTRAST_VertexShader_LightDirection();
3458 }
3459
// FakeLight pixel shader: forwards thread, triangle and span unchanged to
// DPSOFTRAST_PixelShader_LightDirection(), which handles the
// SHADERMODE_FAKELIGHT case itself by inspecting thread->shader_mode.
3460 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3461 {
3462         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3463 }
3464
3465
3466
// LightDirectionMap_ModelSpace vertex shader: runs the shared
// DPSOFTRAST_VertexShader_LightDirection() and additionally loads TEXCOORD4,
// which DPSOFTRAST_PixelShader_LightDirection() uses as the texture coordinate
// for its lightmap and deluxemap lookups in the LIGHTDIRECTIONMAP shader modes.
3467 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3468 {
3469         DPSOFTRAST_VertexShader_LightDirection();
3470         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3471 }
3472
// LightDirectionMap_ModelSpace pixel shader: forwards thread, triangle and span
// unchanged to DPSOFTRAST_PixelShader_LightDirection(), which branches on
// thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE internally.
3473 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3474 {
3475         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3476 }
3477
3478
3479
// LightDirectionMap_TangentSpace vertex shader: runs the shared
// DPSOFTRAST_VertexShader_LightDirection() and additionally loads TEXCOORD4,
// which DPSOFTRAST_PixelShader_LightDirection() uses as the texture coordinate
// for its lightmap and deluxemap lookups in the LIGHTDIRECTIONMAP shader modes.
3480 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3481 {
3482         DPSOFTRAST_VertexShader_LightDirection();
3483         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3484 }
3485
// LightDirectionMap_TangentSpace pixel shader: forwards thread, triangle and span
// unchanged to DPSOFTRAST_PixelShader_LightDirection(), which branches on
// thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE internally.
3486 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3487 {
3488         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3489 }
3490
3491
3492
3493 void DPSOFTRAST_VertexShader_LightDirection(void)
3494 {
3495         int i;
3496         int numvertices = dpsoftrast.numvertices;
3497         float LightDir[4];
3498         float LightVector[4];
3499         float EyePosition[4];
3500         float EyeVectorModelSpace[4];
3501         float EyeVector[4];
3502         float position[4];
3503         float svector[4];
3504         float tvector[4];
3505         float normal[4];
3506         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3507         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3508         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3509         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3510         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3511         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3512         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3513         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3514         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3515         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3516         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3517         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3518         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3519         for (i = 0;i < numvertices;i++)
3520         {
3521                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3522                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3523                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3524                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3525                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3526                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3527                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3528                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3529                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3530                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3531                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3532                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3533                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3534                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3535                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3536                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3537                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3538                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3539                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3540                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3541                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3542                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3543                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3544                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3545                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3546                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3547                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3548                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3549                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3550         }
3551         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3552 }
3553
// Small scalar and 3-component vector helper macros.
//
// DPSOFTRAST_Min / DPSOFTRAST_Max evaluate the selected argument twice, so do
// not pass expressions with side effects.
// The Vector3 macros index their arguments with [0], [1] and [2]; any array or
// pointer expression with at least three elements works.
// DPSOFTRAST_Vector3Length and DPSOFTRAST_Vector3Normalize call sqrt() and
// therefore need <math.h>.
#define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
#define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
#define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
#define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
#define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Scales v in place to unit length; a zero-length vector is left untouched.
// The argument is parenthesized at every use so that pointer expressions such
// as (p + 3) expand correctly, and the temporary has a prefixed name so that it
// cannot capture a caller variable that happens to be called "len".
#define DPSOFTRAST_Vector3Normalize(v)\
do\
{\
	float dpsoftrast_normalize_len = sqrt(DPSOFTRAST_Vector3Dot((v),(v)));\
	if (dpsoftrast_normalize_len)\
	{\
		dpsoftrast_normalize_len = 1.0f / dpsoftrast_normalize_len;\
		(v)[0] *= dpsoftrast_normalize_len;\
		(v)[1] *= dpsoftrast_normalize_len;\
		(v)[2] *= dpsoftrast_normalize_len;\
	}\
}\
while(0)
3572
3573 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3574 {
3575         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3576         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3577         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3578         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3579         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3580         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3581         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3582         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3583         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3584         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3585         int x, startx = span->startx, endx = span->endx;
3586         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3587         float LightVectordata[4];
3588         float LightVectorslope[4];
3589         float EyeVectordata[4];
3590         float EyeVectorslope[4];
3591         float VectorSdata[4];
3592         float VectorSslope[4];
3593         float VectorTdata[4];
3594         float VectorTslope[4];
3595         float VectorRdata[4];
3596         float VectorRslope[4];
3597         float z;
3598         float diffusetex[4];
3599         float glosstex[4];
3600         float surfacenormal[4];
3601         float lightnormal[4];
3602         float lightnormal_modelspace[4];
3603         float eyenormal[4];
3604         float specularnormal[4];
3605         float diffuse;
3606         float specular;
3607         float SpecularPower;
3608         int d[4];
3609         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3610         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3611         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3612         Color_Glow[3] = 0.0f;
3613         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3614         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3615         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3616         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3617         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3618         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3619         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3620         Color_Pants[3] = 0.0f;
3621         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3622         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3623         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3624         Color_Shirt[3] = 0.0f;
3625         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3626         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3627         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3628         {
3629                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3630                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3631         }
3632         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3633         {
3634                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3635         }
3636         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3637         {
3638                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3639                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3640                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3641                 Color_Diffuse[3] = 0.0f;
3642                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3643                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3644                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3645                 LightColor[3] = 0.0f;
3646                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3647                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3648                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3649                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3650                 Color_Specular[3] = 0.0f;
3651                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3652                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3653                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3654
3655                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3656                 {
3657                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3658                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3659                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3660                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3661                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3662                 }
3663                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3664                 {
3665                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3666                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3667                 }
3668                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3669                 {
3670                         // nothing of this needed
3671                 }
3672                 else
3673                 {
3674                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3675                 }
3676
3677                 for (x = startx;x < endx;x++)
3678                 {
3679                         z = buffer_z[x];
3680                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3681                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3682                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3683                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3684                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3685                         {
3686                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3687                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3688                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3689                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3690                         }
3691                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3692                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3693                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3694                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3695                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3696                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3697                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3698                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3699
3700                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3701                         {
3702                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3703                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3704                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3705                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3706
3707                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3708                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3709                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3710                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3711
3712                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3713                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3714                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3715                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3716
3717                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3718                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3719                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3720                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3721
3722                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3723                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3724
3725                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3726                                 {
3727                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3728                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3729                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3730                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3731                                 }
3732                         }
3733                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3734                         {
3735                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3736                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3737                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3738                                 {
3739                                         float f = 1.0f / 256.0f;
3740                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3741                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3742                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3743                                 }
3744                         }
3745                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3746                         {
3747                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3748                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3749                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3750                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3751
3752                                 LightColor[0] = 1.0;
3753                                 LightColor[1] = 1.0;
3754                                 LightColor[2] = 1.0;
3755                         }
3756                         else
3757                         {
3758                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3759                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3760                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3761                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3762                         }
3763
3764                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3765
3766                         if(thread->shader_exactspecularmath)
3767                         {
3768                                 // reflect lightnormal at surfacenormal, take the negative of that
3769                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3770                                 float f;
3771                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3772                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3773                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3774                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3775
3776                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3777                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3778                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3779                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3780                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3781
3782                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3783                         }
3784                         else
3785                         {
3786                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3787                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3788                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3789                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3790
3791                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3792                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3793                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3794                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3795
3796                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3797                         }
3798
3799                         specular = pow(specular, SpecularPower * glosstex[3]);
3800                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3801                         {
3802                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3803                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3804                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3805                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3806                         }
3807                         else
3808                         {
3809                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3810                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3811                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3812                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3813                         }
3814
3815                         buffer_FragColorbgra8[x*4+0] = d[0];
3816                         buffer_FragColorbgra8[x*4+1] = d[1];
3817                         buffer_FragColorbgra8[x*4+2] = d[2];
3818                         buffer_FragColorbgra8[x*4+3] = d[3];
3819                 }
3820         }
3821         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3822         {
3823                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3824                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3825                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3826                 Color_Diffuse[3] = 0.0f;
3827                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3828                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3829                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3830                 LightColor[3] = 0.0f;
3831                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3832
3833                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3834                 {
3835                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3836                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3837                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3838                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3839                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3840                 }
3841                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3842                 {
3843                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3844                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3845                 }
3846                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3847                 {
3848                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3849                 }
3850                 else
3851                 {
3852                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3853                 }
3854
3855                 for (x = startx;x < endx;x++)
3856                 {
3857                         z = buffer_z[x];
3858                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3859                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3860                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3861                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3862                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3863                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3864                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3865                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3866
3867                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3868                         {
3869                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3870                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3871                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3872                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3873
3874                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3875                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3876                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3877                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3878
3879                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3880                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3881                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3882                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3883
3884                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3885                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3886                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3887                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3888
3889                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3890                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3891
3892                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3893                                 {
3894                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3895                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3896                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3897                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3898                                 }
3899                         }
3900                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3901                         {
3902                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3903                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3904                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3905                                 {
3906                                         float f = 1.0f / 256.0f;
3907                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3908                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3909                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3910                                 }
3911                         }
3912                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3913                         {
3914                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3915                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3916                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3917                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3918
3919                                 LightColor[0] = 1.0;
3920                                 LightColor[1] = 1.0;
3921                                 LightColor[2] = 1.0;
3922                         }
3923                         else
3924                         {
3925                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3926                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3927                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3928                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3929                         }
3930
3931                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3932                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3933                         {
3934                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3935                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3936                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3937                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3938                         }
3939                         else
3940                         {
3941                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3942                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3943                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3944                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3945                         }
3946                         buffer_FragColorbgra8[x*4+0] = d[0];
3947                         buffer_FragColorbgra8[x*4+1] = d[1];
3948                         buffer_FragColorbgra8[x*4+2] = d[2];
3949                         buffer_FragColorbgra8[x*4+3] = d[3];
3950                 }
3951         }
3952         else
3953         {
3954                 for (x = startx;x < endx;x++)
3955                 {
3956                         z = buffer_z[x];
3957                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3958                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3959                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3960                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3961
3962                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3963                         {
3964                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3965                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3966                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3967                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3968                         }
3969                         else
3970                         {
3971                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3972                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3973                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3974                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3975                         }
3976                         buffer_FragColorbgra8[x*4+0] = d[0];
3977                         buffer_FragColorbgra8[x*4+1] = d[1];
3978                         buffer_FragColorbgra8[x*4+2] = d[2];
3979                         buffer_FragColorbgra8[x*4+3] = d[3];
3980                 }
3981         }
3982         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3983 }
3984
3985
3986
3987 void DPSOFTRAST_VertexShader_LightSource(void)
3988 {
3989         int i;
3990         int numvertices = dpsoftrast.numvertices;
3991         float LightPosition[4];
3992         float LightVector[4];
3993         float LightVectorModelSpace[4];
3994         float EyePosition[4];
3995         float EyeVectorModelSpace[4];
3996         float EyeVector[4];
3997         float position[4];
3998         float svector[4];
3999         float tvector[4];
4000         float normal[4];
4001         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4002         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4003         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4004         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4005         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4006         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4007         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4008         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4009         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4010         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4011         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4012         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4013         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4014         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4015         for (i = 0;i < numvertices;i++)
4016         {
4017                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4018                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4019                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4020                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4021                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4022                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4023                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4024                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4025                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4026                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4027                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4028                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4029                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4030                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4031                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4032                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4033                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4034                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4035                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4036                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4037                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4038                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4039                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4040                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4041                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4042                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4043                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4044                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4045                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4046                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4047                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4048                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4049         }
4050         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4051         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4052 }
4053
4054 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4055 {
4056 #ifdef SSE2_PRESENT
4057         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4058         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4059         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4060         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4061         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4062         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4063         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4064         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4065         int x, startx = span->startx, endx = span->endx;
4066         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4067         float CubeVectordata[4];
4068         float CubeVectorslope[4];
4069         float LightVectordata[4];
4070         float LightVectorslope[4];
4071         float EyeVectordata[4];
4072         float EyeVectorslope[4];
4073         float z;
4074         float diffusetex[4];
4075         float glosstex[4];
4076         float surfacenormal[4];
4077         float lightnormal[4];
4078         float eyenormal[4];
4079         float specularnormal[4];
4080         float diffuse;
4081         float specular;
4082         float SpecularPower;
4083         float CubeVector[4];
4084         float attenuation;
4085         int d[4];
4086         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4087         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4088         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4089         Color_Glow[3] = 0.0f;
4090         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4091         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4092         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4093         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4094         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4095         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4096         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4097         Color_Diffuse[3] = 0.0f;
4098         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4099         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4100         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4101         Color_Specular[3] = 0.0f;
4102         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4103         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4104         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4105         Color_Pants[3] = 0.0f;
4106         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4107         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4108         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4109         Color_Shirt[3] = 0.0f;
4110         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4111         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4112         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4113         LightColor[3] = 0.0f;
4114         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4115         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4116         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4117         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4118         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4119         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4120         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4121         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4122         {
4123                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4124                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4125         }
4126         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4127                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4128         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4129         {
4130                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4131                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4132                 for (x = startx;x < endx;x++)
4133                 {
4134                         z = buffer_z[x];
4135                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4136                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4137                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4138                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4139                         if (attenuation < 0.01f)
4140                                 continue;
4141                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4142                         {
4143                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4144                                 if (attenuation < 0.01f)
4145                                         continue;
4146                         }
4147
4148                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4149                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4150                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4151                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4152                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4153                         {
4154                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4155                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4156                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4157                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4158                         }
4159                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4160                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4161                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4162                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4163                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4164                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4165                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4166                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4167
4168                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4169                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4170                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4171                         DPSOFTRAST_Vector3Normalize(lightnormal);
4172
4173                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4174
4175                         if(thread->shader_exactspecularmath)
4176                         {
4177                                 // reflect lightnormal at surfacenormal, take the negative of that
4178                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4179                                 float f;
4180                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4181                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4182                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4183                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4184
4185                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4186                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4187                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4188                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4189                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4190
4191                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4192                         }
4193                         else
4194                         {
4195                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4196                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4197                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4198                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4199
4200                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4201                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4202                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4203                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4204
4205                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4206                         }
4207                         specular = pow(specular, SpecularPower * glosstex[3]);
4208
4209                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4210                         {
4211                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4212                                 attenuation *= (1.0f / 255.0f);
4213                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4214                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4215                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4216                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4217                         }
4218                         else
4219                         {
4220                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4221                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4222                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4223                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4224                         }
4225                         buffer_FragColorbgra8[x*4+0] = d[0];
4226                         buffer_FragColorbgra8[x*4+1] = d[1];
4227                         buffer_FragColorbgra8[x*4+2] = d[2];
4228                         buffer_FragColorbgra8[x*4+3] = d[3];
4229                 }
4230         }
4231         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4232         {
4233                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4234                 for (x = startx;x < endx;x++)
4235                 {
4236                         z = buffer_z[x];
4237                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4238                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4239                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4240                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4241                         if (attenuation < 0.01f)
4242                                 continue;
4243                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4244                         {
4245                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4246                                 if (attenuation < 0.01f)
4247                                         continue;
4248                         }
4249
4250                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4251                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4252                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4253                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4254                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4255                         {
4256                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4257                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4258                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4259                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4260                         }
4261                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4262                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4263                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4264                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4265
4266                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4267                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4268                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4269                         DPSOFTRAST_Vector3Normalize(lightnormal);
4270
4271                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4272                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4273                         {
4274                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4275                                 attenuation *= (1.0f / 255.0f);
4276                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4277                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4278                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4279                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4280                         }
4281                         else
4282                         {
4283                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4284                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4285                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4286                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4287                         }
4288                         buffer_FragColorbgra8[x*4+0] = d[0];
4289                         buffer_FragColorbgra8[x*4+1] = d[1];
4290                         buffer_FragColorbgra8[x*4+2] = d[2];
4291                         buffer_FragColorbgra8[x*4+3] = d[3];
4292                 }
4293         }
4294         else
4295         {
4296                 for (x = startx;x < endx;x++)
4297                 {
4298                         z = buffer_z[x];
4299                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4300                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4301                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4302                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4303                         if (attenuation < 0.01f)
4304                                 continue;
4305                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4306                         {
4307                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4308                                 if (attenuation < 0.01f)
4309                                         continue;
4310                         }
4311
4312                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4313                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4314                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4315                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4316                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4317                         {
4318                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4319                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4320                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4321                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4322                         }
4323                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4324                         {
4325                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4326                                 attenuation *= (1.0f / 255.0f);
4327                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4328                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4329                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4330                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4331                         }
4332                         else
4333                         {
4334                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4335                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4336                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4337                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4338                         }
4339                         buffer_FragColorbgra8[x*4+0] = d[0];
4340                         buffer_FragColorbgra8[x*4+1] = d[1];
4341                         buffer_FragColorbgra8[x*4+2] = d[2];
4342                         buffer_FragColorbgra8[x*4+3] = d[3];
4343                 }
4344         }
4345         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4346 #endif
4347 }
4348
4349
4350
4351 void DPSOFTRAST_VertexShader_Refraction(void)
4352 {
4353         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4354 }
4355
4356 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4357 {
4358         // TODO: IMPLEMENT
4359         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4360         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4361         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4362         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4363         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4364 }
4365
4366
4367
4368 void DPSOFTRAST_VertexShader_Water(void)
4369 {
4370         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4371 }
4372
4373
4374 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4375 {
4376         // TODO: IMPLEMENT
4377         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4378         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4379         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4380         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4381         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4382 }
4383
4384
4385
4386 void DPSOFTRAST_VertexShader_ShowDepth(void)
4387 {
4388         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4389 }
4390
4391 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4392 {
4393         // TODO: IMPLEMENT
4394         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4395         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4396         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4397         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4398         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4399 }
4400
4401
4402
4403 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4404 {
4405         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4406 }
4407
4408 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4409 {
4410         // TODO: IMPLEMENT
4411         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4412         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4413         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4414         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4415         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4416 }
4417
4418
4419
4420 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4421 {
4422         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4423 }
4424
4425 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4426 {
4427         // TODO: IMPLEMENT
4428         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4429         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4430         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4431         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4432         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4433 }
4434
4435
4436
4437 typedef struct DPSOFTRAST_ShaderModeInfo_s
4438 {
4439         int lodarrayindex;
4440         void (*Vertex)(void);
4441         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4442         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4443         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4444 }
4445 DPSOFTRAST_ShaderModeInfo;
4446
4447 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4448 {
4449         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4450         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4451         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4452         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4453         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4454         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4455         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4456         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4457         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4458         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4459         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4460         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4461         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4462         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4463         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4464         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4465 };
4466
4467 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4468 {
4469         int i;
4470         int x;
4471         int startx;
4472         int endx;
4473 //      unsigned int c;
4474 //      unsigned int *colorpixel;
4475         unsigned int *depthpixel;
4476         float w;
4477         float wslope;
4478         int depth;
4479         int depthslope;
4480         unsigned int d;
4481         DPSOFTRAST_State_Triangle *triangle;
4482         DPSOFTRAST_State_Span *span;
4483         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4484         for (i = 0; i < thread->numspans; i++)
4485         {
4486                 span = &thread->spans[i];
4487                 triangle = &thread->triangles[span->triangle];
4488                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4489                 {
4490                         wslope = triangle->w[0];
4491                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4492                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4493                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4494                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4495                         startx = span->startx;
4496                         endx = span->endx;
4497                         switch(thread->fb_depthfunc)
4498                         {
4499                         default:
4500                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4501                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4502                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4503                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4504                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4505                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4506                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4507                         }
4508                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4509                         //for (x = startx;x < endx;x++)
4510                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4511                         // if there is no color buffer, skip pixel shader
4512                         while (startx < endx && !pixelmask[startx])
4513                                 startx++;
4514                         while (endx > startx && !pixelmask[endx-1])
4515                                 endx--;
4516                         if (startx >= endx)
4517                                 continue; // no pixels to fill
4518                         span->pixelmask = pixelmask;
4519                         span->startx = startx;
4520                         span->endx = endx;
4521                         // run pixel shader if appropriate
4522                         // do this before running depthmask code, to allow the pixelshader
4523                         // to clear pixelmask values for alpha testing
4524                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4525                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4526                         if (thread->depthmask)
4527                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4528                                         if (pixelmask[x])
4529                                                 depthpixel[x] = d;
4530                 }
4531                 else
4532                 {
4533                         // no depth testing means we're just dealing with color...
4534                         // if there is no color buffer, skip pixel shader
4535                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4536                         {
4537                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4538                                 span->pixelmask = pixelmask;
4539                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4540                         }
4541                 }
4542         }
4543         thread->numspans = 0;
4544 }
4545
4546 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4547
4548 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4549 {
4550 #ifdef SSE2_PRESENT
4551         int cullface = thread->cullface;
4552         int minx, maxx, miny, maxy;
4553         int miny1, maxy1, miny2, maxy2;
4554         __m128i fbmin, fbmax;
4555         __m128 viewportcenter, viewportscale;
4556         int firstvertex = command->firstvertex;
4557         int numvertices = command->numvertices;
4558         int numtriangles = command->numtriangles;
4559         const int *element3i = command->element3i;
4560         const unsigned short *element3s = command->element3s;
4561         int clipped = command->clipped;
4562         int i;
4563         int j;
4564         int k;
4565         int y;
4566         int e[3];
4567         __m128i screeny;
4568         int starty, endy, bandy;
4569         int numpoints;
4570         int clipcase;
4571         float clipdist[4];
4572         __m128 triangleedge1, triangleedge2, trianglenormal;
4573         __m128 clipfrac[3];
4574         __m128 screen[4];
4575         DPSOFTRAST_State_Triangle *triangle;
4576         DPSOFTRAST_Texture *texture;
4577         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4578         miny = thread->fb_scissor[1];
4579         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4580         miny1 = bound(miny, thread->miny1, maxy);
4581         maxy1 = bound(miny, thread->maxy1, maxy);
4582         miny2 = bound(miny, thread->miny2, maxy);
4583         maxy2 = bound(miny, thread->maxy2, maxy);
4584         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4585         {
4586                 if (!ATOMIC_DECREMENT(command->refcount))
4587                 {
4588                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4589                                 MM_FREE(command->arrays);
4590                 }
4591                 return;
4592         }
4593         minx = thread->fb_scissor[0];
4594         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4595         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4596         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4597         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4598         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4599         screen[3] = _mm_setzero_ps();
4600         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4601         for (i = 0;i < numtriangles;i++)
4602         {
4603                 const float *screencoord4f = command->arrays;
4604                 const float *arrays = screencoord4f + numvertices*4;
4605
4606                 // generate the 3 edges of this triangle
4607                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4608                 if (element3s)
4609                 {
4610                         e[0] = element3s[i*3+0] - firstvertex;
4611                         e[1] = element3s[i*3+1] - firstvertex;
4612                         e[2] = element3s[i*3+2] - firstvertex;
4613                 }
4614                 else if (element3i)
4615                 {
4616                         e[0] = element3i[i*3+0] - firstvertex;
4617                         e[1] = element3i[i*3+1] - firstvertex;
4618                         e[2] = element3i[i*3+2] - firstvertex;
4619                 }
4620                 else
4621                 {
4622                         e[0] = i*3+0;
4623                         e[1] = i*3+1;
4624                         e[2] = i*3+2;
4625                 }
4626
4627 #define SKIPBACKFACE \
4628                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4629                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4630                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4631                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4632                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4633                 switch(cullface) \
4634                 { \
4635                 case GL_BACK: \
4636                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4637                                 continue; \
4638                         break; \
4639                 case GL_FRONT: \
4640                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4641                                 continue; \
4642                         break; \
4643                 }
4644
4645 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4646                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4647                         { \
4648                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4649                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4650                         }
4651 #define CLIPPEDVERTEXCOPY(k,p1) \
4652                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4653
4654 #define GENATTRIBCOPY(attrib, p1) \
4655                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4656 #define GENATTRIBLERP(attrib, p1, p2) \
4657                 { \
4658                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4659                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4660                 }
4661 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4662                 switch(clipcase) \
4663                 { \
4664                 default: \
4665                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4666                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4667                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4668                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4669                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4670                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4671                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4672                 }
4673
4674                 if (! clipped)
4675                         goto notclipped;
4676
4677                 // calculate distance from nearplane
4678                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4679                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4680                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4681                 if (clipdist[0] >= 0.0f)
4682                 {
4683                         if (clipdist[1] >= 0.0f)
4684                         {
4685                                 if (clipdist[2] >= 0.0f)
4686                                 {
4687                                 notclipped:
4688                                         // triangle is entirely in front of nearplane
4689                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4690                                         SKIPBACKFACE;
4691                                         numpoints = 3;
4692                                         clipcase = 0;
4693                                 }
4694                                 else
4695                                 {
4696                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4697                                         SKIPBACKFACE;
4698                                         numpoints = 4;
4699                                         clipcase = 1;
4700                                 }
4701                         }
4702                         else
4703                         {
4704                                 if (clipdist[2] >= 0.0f)
4705                                 {
4706                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4707                                         SKIPBACKFACE;
4708                                         numpoints = 4;
4709                                         clipcase = 2;
4710                                 }
4711                                 else
4712                                 {
4713                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4714                                         SKIPBACKFACE;
4715                                         numpoints = 3;
4716                                         clipcase = 3;
4717                                 }
4718                         }
4719                 }
4720                 else if (clipdist[1] >= 0.0f)
4721                 {
4722                         if (clipdist[2] >= 0.0f)
4723                         {
4724                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4725                                 SKIPBACKFACE;
4726                                 numpoints = 4;
4727                                 clipcase = 4;
4728                         }
4729                         else
4730                         {
4731                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4732                                 SKIPBACKFACE;
4733                                 numpoints = 3;
4734                                 clipcase = 5;
4735                         }
4736                 }
4737                 else if (clipdist[2] >= 0.0f)
4738                 {
4739                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4740                         SKIPBACKFACE;
4741                         numpoints = 3;
4742                         clipcase = 6;
4743                 }
4744                 else continue; // triangle is entirely behind nearplane
4745
4746                 {
4747                         // calculate integer y coords for triangle points
4748                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4749                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4750                                         screenmin = _mm_min_epi16(screeni, screenir),
4751                                         screenmax = _mm_max_epi16(screeni, screenir);
4752                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4753                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4754                         screenmin = _mm_max_epi16(screenmin, fbmin);
4755                         screenmax = _mm_min_epi16(screenmax, fbmax);
4756                         // skip offscreen triangles
4757                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4758                                 continue;
4759                         starty = _mm_extract_epi16(screenmin, 1);
4760                         endy = _mm_extract_epi16(screenmax, 1)+1;
4761                         if (starty >= maxy1 && endy <= miny2)
4762                                 continue;
4763                         screeny = _mm_srai_epi32(screeni, 16);
4764                 }
4765
4766                 triangle = &thread->triangles[thread->numtriangles];
4767
4768                 // calculate attribute plans for triangle data...
4769                 // okay, this triangle is going to produce spans, we'd better project
4770                 // the interpolants now (this is what gives perspective texturing),
4771                 // this consists of simply multiplying all arrays by the W coord
4772                 // (which is basically 1/Z), which will be undone per-pixel
4773                 // (multiplying by Z again) to get the perspective-correct array
4774                 // values
4775                 {
4776                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4777                         __m128 mipedgescale, mipdensity;
4778                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4779                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4780                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4781                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4782                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4783                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4784                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4785                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4786                         attribedge1 = _mm_sub_ss(w0, w1);
4787                         attribedge2 = _mm_sub_ss(w2, w1);
4788                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4789                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4790                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4791                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4792                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4793                         _mm_store_ss(&triangle->w[0], attribxslope);
4794                         _mm_store_ss(&triangle->w[1], attribyslope);
4795                         _mm_store_ss(&triangle->w[2], attriborigin);
4796                         mipedgescale = _mm_setzero_ps();
4797                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4798                         {
4799                                 __m128 attrib0, attrib1, attrib2;
4800                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4801                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4802                                         break;
4803                                 arrays += numvertices*4;
4804                                 GENATTRIBS(attrib0, attrib1, attrib2);
4805                                 attriborigin = _mm_mul_ps(attrib1, w1);
4806                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4807                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4808                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4809                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4810                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4811                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4812                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4813                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
4814                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4815                                 {
4816                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4817                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4818                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4819                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4820                                 }
4821                         }
4822
4823                         memset(triangle->mip, 0, sizeof(triangle->mip));
4824                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4825                         {
4826                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4827                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4828                                         break;
4829                                 texture = thread->texbound[texunit];
4830                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4831                                 {
4832                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4833                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4834                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4835                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4836                                         // this will be multiplied in the texturing routine by the texture resolution
4837                                         y = _mm_cvtss_si32(mipdensity);
4838                                         if (y > 0)
4839                                         {
4840                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4841                                                 if (y > texture->mipmaps - 1)
4842                                                         y = texture->mipmaps - 1;
4843                                                 triangle->mip[texunit] = y;
4844                                         }
4845                                 }
4846                         }
4847                 }
4848         
4849                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4850                 for (; y < bandy;)
4851                 {
4852                         __m128 xcoords, xslope;
4853                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4854                         int yccmask = _mm_movemask_epi8(ycc);
4855                         int edge0p, edge0n, edge1p, edge1n;
4856                         int nexty;
4857                         if (numpoints == 4)
4858                         {
4859                                 switch(yccmask)
4860                                 {
4861                                 default:
4862                                 case 0xFFFF: /*0000*/ y = endy; continue;
4863                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4864                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4865                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4866                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4867                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4868                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4869                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4870                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4871                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4872                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4873                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4874                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4875                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4876                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4877                                 case 0x0000: /*1111*/ y++; continue;
4878                                 }
4879                         }
4880                         else
4881                         {
4882                                 switch(yccmask)
4883                                 {
4884                                 default:
4885                                 case 0xFFFF: /*000*/ y = endy; continue;
4886                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4887                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4888                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4889                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4890                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4891                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4892                                 case 0x0000: /*111*/ y++; continue;
4893                                 }
4894                         }
4895                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4896                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4897                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4898                         nexty = _mm_extract_epi16(ycc, 0);
4899                         if (nexty >= bandy) nexty = bandy-1;
4900                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4901                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4902                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4903                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4904                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4905                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
4906                         {
4907                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
4908                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
4909                         }
4910                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4911                         {
4912                                 int startx, endx, offset;
4913                                 startx = _mm_cvtss_si32(xcoords);
4914                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4915                                 if (startx < minx) 
4916                                 {
4917                                         if (startx < 0) startx = 0;
4918                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4919                                 }
4920                                 if (endx > maxx) endx = maxx;
4921                                 if (startx >= endx) continue;
4922                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4923                                 {
4924                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4925                                         span->triangle = thread->numtriangles;
4926                                         span->x = offset;
4927                                         span->y = y;
4928                                         span->startx = max(minx - offset, 0);
4929                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4930                                         if (span->startx >= span->endx)
4931                                                 continue; 
4932                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4933                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
4934                                 }
4935                         }
4936                 }
4937
4938                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4939                 {
4940                         DPSOFTRAST_Draw_ProcessSpans(thread);
4941                         thread->numtriangles = 0;
4942                 }
4943         }
4944
4945         if (!ATOMIC_DECREMENT(command->refcount))
4946         {
4947                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4948                         MM_FREE(command->arrays);
4949         }
4950
4951         if (thread->numspans > 0 || thread->numtriangles > 0)
4952         {
4953                 DPSOFTRAST_Draw_ProcessSpans(thread);
4954                 thread->numtriangles = 0;
4955         }
4956 #endif
4957 }
4958
4959 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4960 {
4961         int i;
4962         int j;
4963         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
4964         int datasize = 2*numvertices*sizeof(float[4]);
4965         DPSOFTRAST_Command_Draw *command;
4966         unsigned char *data;
4967         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4968         {
4969                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4970                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4971                         break;
4972                 datasize += numvertices*sizeof(float[4]);
4973         }
4974         if (element3s)
4975                 datasize += numtriangles*sizeof(unsigned short[3]);
4976         else if (element3i)
4977                 datasize += numtriangles*sizeof(int[3]);
4978         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
4979         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4980         {
4981                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4982                 data = (unsigned char *)MM_CALLOC(datasize, 1);
4983         }
4984         else
4985         {
4986                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4987                 data = (unsigned char *)command + commandsize;
4988         }
4989         command->firstvertex = firstvertex;
4990         command->numvertices = numvertices;
4991         command->numtriangles = numtriangles;
4992         command->arrays = (float *)data;
4993         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4994         dpsoftrast.firstvertex = firstvertex;
4995         dpsoftrast.numvertices = numvertices;
4996         dpsoftrast.screencoord4f = (float *)data;
4997         data += numvertices*sizeof(float[4]);
4998         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4999         data += numvertices*sizeof(float[4]);
5000         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5001         {
5002                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5003                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5004                         break;
5005                 dpsoftrast.post_array4f[j] = (float *)data;
5006                 data += numvertices*sizeof(float[4]);
5007         }
5008         command->element3i = NULL;
5009         command->element3s = NULL;
5010         if (element3s)
5011         {
5012                 command->element3s = (unsigned short *)data;
5013                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5014         }
5015         else if (element3i)
5016         {
5017                 command->element3i = (int *)data;
5018                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5019         }
5020         return command;
5021 }
5022
5023 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5024 {
5025         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5026         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5027         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5028         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5029         if (command->starty >= command->endy)
5030         {
5031                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5032                         MM_FREE(command->arrays);
5033                 DPSOFTRAST_UndoCommand(command->commandsize);
5034                 return;
5035         }
5036         command->clipped = dpsoftrast.drawclipped;
5037         command->refcount = dpsoftrast.numthreads;
5038
5039         if (dpsoftrast.usethreads)
5040         {
5041                 int i;
5042                 DPSOFTRAST_Draw_SyncCommands();
5043                 for (i = 0; i < dpsoftrast.numthreads; i++)
5044                 {
5045                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5046                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5047                                 Thread_CondSignal(thread->drawcond);
5048                 }
5049         }
5050         else
5051         {
5052                 DPSOFTRAST_Draw_FlushThreads();
5053         }
5054 }
5055  
5056 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5057 {
5058         int commandoffset = thread->commandoffset;
5059         while (commandoffset != endoffset)
5060         {
5061                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5062                 switch (command->opcode)
5063                 {
5064 #define INTERPCOMMAND(name) \
5065                 case DPSOFTRAST_OPCODE_##name : \
5066                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5067                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5068                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5069                                 commandoffset = 0; \
5070                         break;
5071                 INTERPCOMMAND(Viewport)
5072                 INTERPCOMMAND(ClearColor)
5073                 INTERPCOMMAND(ClearDepth)
5074                 INTERPCOMMAND(ColorMask)
5075                 INTERPCOMMAND(DepthTest)
5076                 INTERPCOMMAND(ScissorTest)
5077                 INTERPCOMMAND(Scissor)
5078                 INTERPCOMMAND(BlendFunc)
5079                 INTERPCOMMAND(BlendSubtract)
5080                 INTERPCOMMAND(DepthMask)
5081                 INTERPCOMMAND(DepthFunc)
5082                 INTERPCOMMAND(DepthRange)
5083                 INTERPCOMMAND(PolygonOffset)
5084                 INTERPCOMMAND(CullFace)
5085                 INTERPCOMMAND(AlphaTest)
5086                 INTERPCOMMAND(AlphaFunc)
5087                 INTERPCOMMAND(SetTexture)
5088                 INTERPCOMMAND(SetShader)
5089                 INTERPCOMMAND(Uniform4f)
5090                 INTERPCOMMAND(UniformMatrix4f)
5091                 INTERPCOMMAND(Uniform1i)
5092
5093                 case DPSOFTRAST_OPCODE_Draw:
5094                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5095                         commandoffset += command->commandsize;
5096                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5097                                 commandoffset = 0;
5098                         thread->commandoffset = commandoffset;
5099                         break;
5100
5101                 case DPSOFTRAST_OPCODE_Reset:
5102                         commandoffset = 0;
5103                         break;
5104                 }
5105         }
5106         thread->commandoffset = commandoffset;
5107 }
5108
5109 static int DPSOFTRAST_Draw_Thread(void *data)
5110 {
5111         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5112         while(thread->index >= 0)
5113         {
5114                 if (thread->commandoffset != dpsoftrast.drawcommand)
5115                 {
5116                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5117                 }
5118                 else 
5119                 {
5120                         Thread_LockMutex(thread->drawmutex);
5121                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5122                         {
5123                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5124                                 thread->starving = true;
5125                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5126                                 thread->starving = false;
5127                         }
5128                         Thread_UnlockMutex(thread->drawmutex);
5129                 }
5130         }   
5131         return 0;
5132 }
5133
5134 static void DPSOFTRAST_Draw_FlushThreads(void)
5135 {
5136         DPSOFTRAST_State_Thread *thread;
5137         int i;
5138         DPSOFTRAST_Draw_SyncCommands();
5139         if (dpsoftrast.usethreads) 
5140         {
5141                 for (i = 0; i < dpsoftrast.numthreads; i++)
5142                 {
5143                         thread = &dpsoftrast.threads[i];
5144                         if (thread->commandoffset != dpsoftrast.drawcommand)
5145                         {
5146                                 Thread_LockMutex(thread->drawmutex);
5147                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5148                                         Thread_CondSignal(thread->drawcond);
5149                                 Thread_UnlockMutex(thread->drawmutex);
5150                         }
5151                 }
5152                 for (i = 0; i < dpsoftrast.numthreads; i++)
5153                 {
5154                         thread = &dpsoftrast.threads[i];
5155                         if (thread->commandoffset != dpsoftrast.drawcommand)
5156                         {
5157                                 Thread_LockMutex(thread->drawmutex);
5158                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5159                                 {
5160                                         thread->waiting = true;
5161                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5162                                         thread->waiting = false;
5163                                 }
5164                                 Thread_UnlockMutex(thread->drawmutex);
5165                         }
5166                 }
5167         }
5168         else
5169         {
5170                 for (i = 0; i < dpsoftrast.numthreads; i++)
5171                 {
5172                         thread = &dpsoftrast.threads[i];
5173                         if (thread->commandoffset != dpsoftrast.drawcommand)
5174                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5175                 }
5176         }
5177         dpsoftrast.commandpool.usedcommands = 0;
5178 }
5179
// Public flush: forwards to DPSOFTRAST_Draw_FlushThreads, which returns only
// after all queued commands have been executed.
void DPSOFTRAST_Flush(void)
{
        DPSOFTRAST_Draw_FlushThreads();
}
5184
// Public finish: performs the same work as DPSOFTRAST_Flush, i.e. runs
// DPSOFTRAST_Draw_FlushThreads so that all queued commands are executed
// before returning.
void DPSOFTRAST_Finish(void)
{
        DPSOFTRAST_Draw_FlushThreads();
}
5189
5190 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5191 {
5192         int i;
5193         union
5194         {
5195                 int i;
5196                 unsigned char b[4];
5197         }
5198         u;
5199         u.i = 1;
5200         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5201         dpsoftrast.bigendian = u.b[3];
5202         dpsoftrast.fb_width = width;
5203         dpsoftrast.fb_height = height;
5204         dpsoftrast.fb_depthpixels = depthpixels;
5205         dpsoftrast.fb_colorpixels[0] = colorpixels;
5206         dpsoftrast.fb_colorpixels[1] = NULL;
5207         dpsoftrast.fb_colorpixels[1] = NULL;
5208         dpsoftrast.fb_colorpixels[1] = NULL;
5209         dpsoftrast.viewport[0] = 0;
5210         dpsoftrast.viewport[1] = 0;
5211         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5212         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5213         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5214         dpsoftrast.texture_firstfree = 1;
5215         dpsoftrast.texture_end = 1;
5216         dpsoftrast.texture_max = 0;
5217         dpsoftrast.color[0] = 1;
5218         dpsoftrast.color[1] = 1;
5219         dpsoftrast.color[2] = 1;
5220         dpsoftrast.color[3] = 1;
5221         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5222         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5223         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5224         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5225         for (i = 0; i < dpsoftrast.numthreads; i++)
5226         {
5227                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5228                 thread->index = i;
5229                 thread->cullface = GL_BACK;
5230                 thread->colormask[1] = 1;
5231                 thread->colormask[2] = 1;
5232                 thread->colormask[3] = 1;
5233                 thread->blendfunc[0] = GL_ONE;
5234                 thread->blendfunc[1] = GL_ZERO;
5235                 thread->depthmask = true;
5236                 thread->depthtest = true;
5237                 thread->depthfunc = GL_LEQUAL;
5238                 thread->scissortest = false;
5239                 thread->alphatest = false;
5240                 thread->alphafunc = GL_GREATER;
5241                 thread->alphavalue = 0.5f;
5242                 thread->viewport[0] = 0;
5243                 thread->viewport[1] = 0;
5244                 thread->viewport[2] = dpsoftrast.fb_width;
5245                 thread->viewport[3] = dpsoftrast.fb_height;
5246                 thread->scissor[0] = 0;
5247                 thread->scissor[1] = 0;
5248                 thread->scissor[2] = dpsoftrast.fb_width;
5249                 thread->scissor[3] = dpsoftrast.fb_height;
5250                 thread->depthrange[0] = 0;
5251                 thread->depthrange[1] = 1;
5252                 thread->polygonoffset[0] = 0;
5253                 thread->polygonoffset[1] = 0;
5254         
5255                 if (dpsoftrast.interlace)
5256                 {
5257                         thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5258                         thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5259                         thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5260                         thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5261                 }
5262                 else
5263                 {
5264                         thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5265                         thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5266                 }
5267
5268                 thread->numspans = 0;
5269                 thread->numtriangles = 0;
5270                 thread->commandoffset = 0;
5271                 thread->waiting = false;
5272                 thread->starving = false;
5273            
5274                 thread->validate = -1;
5275                 DPSOFTRAST_Validate(thread, -1);
5276  
5277                 if (dpsoftrast.usethreads)
5278                 {
5279                         thread->waitcond = Thread_CreateCond();
5280                         thread->drawcond = Thread_CreateCond();
5281                         thread->drawmutex = Thread_CreateMutex();
5282                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5283                 }
5284         }
5285         return 0;
5286 }
5287
5288 void DPSOFTRAST_Shutdown(void)
5289 {
5290         int i;
5291         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5292         {
5293                 DPSOFTRAST_State_Thread *thread;
5294                 for (i = 0; i < dpsoftrast.numthreads; i++)
5295                 {
5296                         thread = &dpsoftrast.threads[i];
5297                         Thread_LockMutex(thread->drawmutex);
5298                         thread->index = -1;
5299                         Thread_CondSignal(thread->drawcond);
5300                         Thread_UnlockMutex(thread->drawmutex);
5301                         Thread_WaitThread(thread->thread, 0);
5302                         Thread_DestroyCond(thread->waitcond);
5303                         Thread_DestroyCond(thread->drawcond);
5304                         Thread_DestroyMutex(thread->drawmutex);
5305                 }
5306         }
5307         for (i = 0;i < dpsoftrast.texture_end;i++)
5308                 if (dpsoftrast.texture[i].bytes)
5309                         MM_FREE(dpsoftrast.texture[i].bytes);
5310         if (dpsoftrast.texture)
5311                 free(dpsoftrast.texture);
5312         if (dpsoftrast.threads)
5313                 MM_FREE(dpsoftrast.threads);
5314         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5315 }
5316