]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
remove errant debugging code... again
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "dpsoftrast.h"
7
8 #ifdef USE_SDL
9 #define USE_THREADS
10 #endif
11
12 #ifndef __cplusplus
13 typedef qboolean bool;
14 #endif
15
// Alignment sizes in bytes: ALIGN_SIZE also serves as COMMAND_SIZE, and
// ATOMIC_SIZE is the alignment requested from _mm_malloc() on the SSE2 path.
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32
18
// Compiler-specific spelling of the alignment attributes and, for threaded
// builds, of the atomic counter operations. Only the SSE2 path uses them.
#if defined(SSE2_PRESENT)
	#if defined(__GNUC__)
		// GCC-style attribute syntax: the attribute follows the declarator
		#define ALIGN(var) var __attribute__((__aligned__(16)))
		#define ATOMIC(var) var __attribute__((__aligned__(32)))
		#if defined(USE_THREADS)
			#define MEMORY_BARRIER (_mm_sfence())
			//(__sync_synchronize())
			#define ATOMIC_COUNTER volatile int
			#define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
			#define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
			#define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
		#endif
	#elif defined(_MSC_VER)
		// MSVC-style attribute syntax: the declspec precedes the declarator
		#define ALIGN(var) __declspec(align(16)) var
		#define ATOMIC(var) __declspec(align(32)) var
		#if defined(USE_THREADS)
			#define MEMORY_BARRIER (_mm_sfence())
			//(MemoryBarrier())
			#define ATOMIC_COUNTER volatile LONG
			#define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
			#define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
			#define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
		#endif
	#else
		// any other compiler: turn off both threading and the SSE2 path
		#undef USE_THREADS
		#undef SSE2_PRESENT
	#endif
#endif

// Without SSE2 the alignment wrappers expand to the bare declaration.
#if !defined(SSE2_PRESENT)
	#define ALIGN(var) var
	#define ATOMIC(var) var
#endif
52
#if defined(USE_THREADS)
// threaded build: the SDL headers supply the thread, condition and mutex types
#include <SDL.h>
#include <SDL_thread.h>
#else
	// non-threaded build: the barrier is a no-op, the atomic operations are
	// ordinary arithmetic, and the SDL handle types are stubbed as void so
	// pointer fields of those types still compile
	#define MEMORY_BARRIER ((void)0)
	#define ATOMIC_COUNTER int
	#define ATOMIC_INCREMENT(counter) (++(counter))
	#define ATOMIC_DECREMENT(counter) (--(counter))
	#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
	typedef void SDL_Thread;
	typedef void SDL_cond;
	typedef void SDL_mutex;
#endif
66
// Allocation helpers. On the SSE2 path memory is aligned to ATOMIC_SIZE bytes
// and must be released with MM_FREE; otherwise the plain C allocator is used.
#ifdef SSE2_PRESENT
#include <emmintrin.h>

// allocate size bytes aligned to ATOMIC_SIZE (contents are not initialized)
#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// Aligned counterpart of calloc(): allocates nmemb*size zero-filled bytes.
// Returns NULL when the allocation fails or when nmemb*size does not fit in a
// size_t (the same contract calloc() gives the non-SSE2 build).
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// reject requests whose byte count would wrap around
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
85
// Indices of the per-vertex attribute arrays; DPSOFTRAST_ARRAY_TOTAL is the
// array count and sizes the attribs[] and post_array4f[] tables.
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION,  // vertex position
	DPSOFTRAST_ARRAY_COLOR,     // vertex color
	DPSOFTRAST_ARRAY_TEXCOORD0, // first of eight texture coordinate sets
	DPSOFTRAST_ARRAY_TEXCOORD1,
	DPSOFTRAST_ARRAY_TEXCOORD2,
	DPSOFTRAST_ARRAY_TEXCOORD3,
	DPSOFTRAST_ARRAY_TEXCOORD4,
	DPSOFTRAST_ARRAY_TEXCOORD5,
	DPSOFTRAST_ARRAY_TEXCOORD6,
	DPSOFTRAST_ARRAY_TEXCOORD7,
	DPSOFTRAST_ARRAY_TOTAL      // number of arrays, not an array itself
}
DPSOFTRAST_ARRAY;
101
102 typedef struct DPSOFTRAST_Texture_s
103 {
104         int flags;
105         int width;
106         int height;
107         int depth;
108         int sides;
109         DPSOFTRAST_TEXTURE_FILTER filter;
110         int mipmaps;
111         int size;
112         ATOMIC_COUNTER binds;
113         unsigned char *bytes;
114         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
115 }
116 DPSOFTRAST_Texture;
117
// Commands use the same size unit and alignment wrapper as ALIGN().
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Header shared by every command; DEFCOMMAND-generated structs begin with the
// same two members.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode;       // a DPSOFTRAST_OPCODE_* value
	unsigned short commandsize; // size of this command (units not visible here; presumably bytes -- confirm)
}
DPSOFTRAST_Command);
127
// Opcode 0 has no command struct of its own.
enum { DPSOFTRAST_OPCODE_Reset = 0 };

// Declares a command type: defines the constant DPSOFTRAST_OPCODE_<name> with
// value opcodeval and an aligned struct DPSOFTRAST_Command_<name> made of the
// common header (opcode, commandsize) followed by the caller-supplied fields.
// The expansion ends with its own semicolon.
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );

// Byte capacity of the command pool buffer, and a per-command size limit.
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
141
142 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
143 {
144         int freecommand;
145         int usedcommands;
146         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
147 }
148 DPSOFTRAST_State_Command_Pool);
149
150 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
151 {
152         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
153         float w[3];
154         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
155 }
156 DPSOFTRAST_State_Triangle);
157
// Evaluates attribute array `arrayindex` of `triangle` at the start of `span`
// (SSE version): slope receives attribs[arrayindex][0], and data receives
// attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1].
// data and slope must be assignable __m128 lvalues.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[(arrayindex)][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[(arrayindex)][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[(arrayindex)][1])))); \
}
// Scalar version of the above: data and slope are arrays of (or pointers to)
// four floats. Every macro argument is parenthesized so that expressions such
// as `spans + i` or `base + 1` can be passed safely.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[(arrayindex)][0][0]; \
	(slope)[1] = (triangle)->attribs[(arrayindex)][0][1]; \
	(slope)[2] = (triangle)->attribs[(arrayindex)][0][2]; \
	(slope)[3] = (triangle)->attribs[(arrayindex)][0][3]; \
	(data)[0] = (triangle)->attribs[(arrayindex)][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][0]; \
	(data)[1] = (triangle)->attribs[(arrayindex)][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][1]; \
	(data)[2] = (triangle)->attribs[(arrayindex)][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][2]; \
	(data)[3] = (triangle)->attribs[(arrayindex)][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[(arrayindex)][1][3]; \
}
174                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// A horizontal run of pixels produced from one triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	int triangle;             // triangle this span was generated by
	int x;                    // framebuffer x coord
	int y;                    // framebuffer y coord
	int length;               // pixel count
	int startx;               // usable range (according to pixelmask)
	int endx;                 // usable range (according to pixelmask)
	unsigned char *pixelmask; // true for pixels that passed depth test, false for others
}
DPSOFTRAST_State_Span);
188
// Capacities of the per-thread spans[] and triangles[] arrays.
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// Bits of the per-thread "validate" field; each one marks a group of derived
// values that DPSOFTRAST_Validate recomputes and then clears.
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all three groups together
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
196
// Blend modes that DPSOFTRAST_RecalcBlendFunc selects from the source and
// destination blend factors; DPSOFTRAST_BLENDMODE_TOTAL is the mode count.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE,      // also the fallback for unrecognized factor pairs
	DPSOFTRAST_BLENDMODE_ALPHA,
	DPSOFTRAST_BLENDMODE_ADDALPHA,
	DPSOFTRAST_BLENDMODE_ADD,
	DPSOFTRAST_BLENDMODE_INVMOD,
	DPSOFTRAST_BLENDMODE_MUL,
	DPSOFTRAST_BLENDMODE_MUL2,
	DPSOFTRAST_BLENDMODE_SUBALPHA,
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
	DPSOFTRAST_BLENDMODE_TOTAL        // number of modes, not a mode itself
}
DPSOFTRAST_BLENDMODE;
211
212 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
213 {
214         SDL_Thread *thread;
215         int index;
216         
217         int cullface;
218         int colormask[4];
219         int blendfunc[2];
220         int blendsubtract;
221         int depthmask;
222         int depthtest;
223         int depthfunc;
224         int scissortest;
225         int alphatest;
226         int alphafunc;
227         float alphavalue;
228         int viewport[4];
229         int scissor[4];
230         float depthrange[2];
231         float polygonoffset[2];
232
233         int shader_mode;
234         int shader_permutation;
235
236         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
237         
238         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
239         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
240
241         // DPSOFTRAST_VALIDATE_ flags
242         int validate;
243
244         // derived values (DPSOFTRAST_VALIDATE_FB)
245         int fb_colormask;
246         int fb_clearscissor[4];
247         ALIGN(float fb_viewportcenter[4]);
248         ALIGN(float fb_viewportscale[4]);
249
250         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
251         int fb_depthfunc;
252
253         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
254         int fb_blendmode;
255
256         ATOMIC(volatile int commandoffset);
257
258         volatile bool waiting;
259         volatile bool starving;
260         SDL_cond *waitcond;
261         SDL_cond *drawcond;
262         SDL_mutex *drawmutex;
263
264         int numspans;
265         int numtriangles;
266         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
267         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
268 }
269 DPSOFTRAST_State_Thread);
270
271 typedef ATOMIC(struct DPSOFTRAST_State_s
272 {
273         int fb_width;
274         int fb_height;
275         unsigned int *fb_depthpixels;
276         unsigned int *fb_colorpixels[4];
277
278         int viewport[4];
279         ALIGN(float fb_viewportcenter[4]);
280         ALIGN(float fb_viewportscale[4]);
281
282         float color[4];
283         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
284         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
285
286         const float *pointer_vertex3f;
287         const float *pointer_color4f;
288         const unsigned char *pointer_color4ub;
289         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
290         int stride_vertex;
291         int stride_color;
292         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
293         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
294         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
295
296         int firstvertex;
297         int numvertices;
298         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
299         float *screencoord4f;
300         int drawstarty;
301         int drawendy;
302         int drawclipped;
303         
304         int shader_mode;
305         int shader_permutation;
306
307         int texture_max;
308         int texture_end;
309         int texture_firstfree;
310         DPSOFTRAST_Texture *texture;
311
312         int bigendian;
313
314         // error reporting
315         const char *errorstring;
316
317         int numthreads;
318         DPSOFTRAST_State_Thread *threads;
319
320         ATOMIC(volatile int drawcommand);
321
322         DPSOFTRAST_State_Command_Pool commandpool;
323 }
324 DPSOFTRAST_State);
325
326 DPSOFTRAST_State dpsoftrast;
327
// Depth buffer encoding: depth d is stored as DPSOFTRAST_DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (nominally 0..1) into one BGRA8 value: alpha in
// bits 24-31, red in 16-23, green in 8-15, blue in 0-7. Each channel is scaled
// by 255 and rounded. Arguments are parenthesized so expressions can be
// passed, and the shifts are done on unsigned values so a full alpha channel
// does not overflow a signed int; the result type is still int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)( \
	((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
	((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | \
	 (unsigned int)(int)((b) * 255.0f + 0.5f) | \
	((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth to the integer depth buffer representation.
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
333
334 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
335 {
336         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
337         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
338         fb_viewportcenter[3] = 0.5f;
339         fb_viewportcenter[0] = 0.0f;
340         fb_viewportscale[1] = 0.5f * viewport[2];
341         fb_viewportscale[2] = -0.5f * viewport[3];
342         fb_viewportscale[3] = 0.5f;
343         fb_viewportscale[0] = 1.0f;
344 }
345
346 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
347 {
348         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
349         // and viewport projection values
350         int x1, x2;
351         int y1, y2;
352         x1 = thread->scissor[0];
353         x2 = thread->scissor[0] + thread->scissor[2];
354         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
355         y2 = dpsoftrast.fb_height - thread->scissor[1];
356         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
357         if (x1 < 0) x1 = 0;
358         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
359         if (y1 < 0) y1 = 0;
360         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
361         thread->fb_clearscissor[0] = x1;
362         thread->fb_clearscissor[1] = y1;
363         thread->fb_clearscissor[2] = x2 - x1;
364         thread->fb_clearscissor[3] = y2 - y1;
365
366         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
367 }
368
369 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
370 {
371         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
372 }
373
374 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
375 {
376         if (thread->blendsubtract)
377         {
378                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
379                 {
380                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
381                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
382                 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
383                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
384                 }
385         }
386         else
387         {       
388                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
389                 {
390                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
391                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
392                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
393                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
394                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
395                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
396                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
397                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
398                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
399                 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
400                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
401                 }
402         }
403 }
404
405 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
406
407 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
408 {
409         mask &= thread->validate;
410         if (!mask)
411                 return;
412         if (mask & DPSOFTRAST_VALIDATE_FB)
413         {
414                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
415                 DPSOFTRAST_RecalcFB(thread);
416         }
417         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
418         {
419                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
420                 DPSOFTRAST_RecalcDepthFunc(thread);
421         }
422         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
423         {
424                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
425                 DPSOFTRAST_RecalcBlendFunc(thread);
426         }
427 }
428
429 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
430 {
431         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
432                 return &dpsoftrast.texture[index];
433         return NULL;
434 }
435
436 static void DPSOFTRAST_Texture_Grow(void)
437 {
438         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
439         DPSOFTRAST_State_Thread *thread;
440         int i;
441         int j;
442         DPSOFTRAST_Flush();
443         // expand texture array as needed
444         if (dpsoftrast.texture_max < 1024)
445                 dpsoftrast.texture_max = 1024;
446         else
447                 dpsoftrast.texture_max *= 2;
448         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
449         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
450                 if(dpsoftrast.texbound[i])
451                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
452         for (j = 0; j < dpsoftrast.numthreads; j++)
453         {
454                 thread = &dpsoftrast.threads[j];
455                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
456                         if(thread->texbound[i])
457                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
458         }
459 }
460
461 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
462 {
463         int w;
464         int h;
465         int d;
466         int size;
467         int s;
468         int texnum;
469         int mipmaps;
470         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
471         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
472         DPSOFTRAST_Texture *texture;
473         if (width*height*depth < 1)
474         {
475                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
476                 return 0;
477         }
478         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
479         {
480                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
481                 return 0;
482         }
483         switch(texformat)
484         {
485         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
486         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
487         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
488                 break;
489         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
490                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
491                 {
492                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
493                         return 0;
494                 }
495                 if (depth != 1)
496                 {
497                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
498                         return 0;
499                 }
500                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
501                 {
502                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
503                         return 0;
504                 }
505                 break;
506         }
507         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
508         {
509                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
510                 return 0;
511         }
512         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
513         {
514                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
515                 return 0;
516         }
517         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
518         {
519                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
520                 return 0;
521         }
522         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
523         {
524                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
525                 return 0;
526         }
527         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
528         {
529                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
530                 return 0;
531         }
532         // find first empty slot in texture array
533         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
534                 if (!dpsoftrast.texture[texnum].bytes)
535                         break;
536         dpsoftrast.texture_firstfree = texnum + 1;
537         if (dpsoftrast.texture_max <= texnum)
538                 DPSOFTRAST_Texture_Grow();
539         if (dpsoftrast.texture_end <= texnum)
540                 dpsoftrast.texture_end = texnum + 1;
541         texture = &dpsoftrast.texture[texnum];
542         memset(texture, 0, sizeof(*texture));
543         texture->flags = flags;
544         texture->width = width;
545         texture->height = height;
546         texture->depth = depth;
547         texture->sides = sides;
548         texture->binds = 0;
549         w = width;
550         h = height;
551         d = depth;
552         size = 0;
553         mipmaps = 0;
554         w = width;
555         h = height;
556         d = depth;
557         for (;;)
558         {
559                 s = w * h * d * sides * 4;
560                 texture->mipmap[mipmaps][0] = size;
561                 texture->mipmap[mipmaps][1] = s;
562                 texture->mipmap[mipmaps][2] = w;
563                 texture->mipmap[mipmaps][3] = h;
564                 texture->mipmap[mipmaps][4] = d;
565                 size += s;
566                 mipmaps++;
567                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
568                         break;
569                 if (w > 1) w >>= 1;
570                 if (h > 1) h >>= 1;
571                 if (d > 1) d >>= 1;
572         }
573         texture->mipmaps = mipmaps;
574         texture->size = size;
575
576         // allocate the pixels now
577         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
578
579         return texnum;
580 }
581 void DPSOFTRAST_Texture_Free(int index)
582 {
583         DPSOFTRAST_Texture *texture;
584         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
585         if (texture->binds)
586                 DPSOFTRAST_Flush();
587         if (texture->bytes)
588                 MM_FREE(texture->bytes);
589         texture->bytes = NULL;
590         memset(texture, 0, sizeof(*texture));
591         // adjust the free range and used range
592         if (dpsoftrast.texture_firstfree > index)
593                 dpsoftrast.texture_firstfree = index;
594         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
595                 dpsoftrast.texture_end--;
596 }
597 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
598 {
599         int i, x, y, z, w, layer0, layer1, row0, row1;
600         unsigned char *o, *i0, *i1, *i2, *i3;
601         DPSOFTRAST_Texture *texture;
602         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
603         if (texture->mipmaps <= 1)
604                 return;
605         for (i = 1;i < texture->mipmaps;i++)
606         {
607                 for (z = 0;z < texture->mipmap[i][4];z++)
608                 {
609                         layer0 = z*2;
610                         layer1 = z*2+1;
611                         if (layer1 >= texture->mipmap[i-1][4])
612                                 layer1 = texture->mipmap[i-1][4]-1;
613                         for (y = 0;y < texture->mipmap[i][3];y++)
614                         {
615                                 row0 = y*2;
616                                 row1 = y*2+1;
617                                 if (row1 >= texture->mipmap[i-1][3])
618                                         row1 = texture->mipmap[i-1][3]-1;
619                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
620                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
621                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
622                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
623                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
624                                 w = texture->mipmap[i][2];
625                                 if (layer1 > layer0)
626                                 {
627                                         if (texture->mipmap[i-1][2] > 1)
628                                         {
629                                                 // average 3D texture
630                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
631                                                 {
632                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
633                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
634                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
635                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
636                                                 }
637                                         }
638                                         else
639                                         {
640                                                 // average 3D mipmap with parent width == 1
641                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
642                                                 {
643                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
644                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
645                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
646                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
647                                                 }
648                                         }
649                                 }
650                                 else
651                                 {
652                                         if (texture->mipmap[i-1][2] > 1)
653                                         {
654                                                 // average 2D texture (common case)
655                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
656                                                 {
657                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
658                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
659                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
660                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
661                                                 }
662                                         }
663                                         else
664                                         {
665                                                 // 2D texture with parent width == 1
666                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
667                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
668                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
669                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
670                                         }
671                                 }
672                         }
673                 }
674         }
675 }
676 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
677 {
678         DPSOFTRAST_Texture *texture;
679         unsigned char *dst;
680         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
681         if (texture->binds)
682                 DPSOFTRAST_Flush();
683         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
684         while (blockheight > 0)
685         {
686                 memcpy(dst, pixels, blockwidth * 4);
687                 pixels += blockwidth * 4;
688                 dst += texture->mipmap[0][2] * 4;
689                 blockheight--;
690         }
691         DPSOFTRAST_Texture_CalculateMipmaps(index);
692 }
693 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
694 {
695         DPSOFTRAST_Texture *texture;
696         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
697         if (texture->binds)
698                 DPSOFTRAST_Flush();
699         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
700         DPSOFTRAST_Texture_CalculateMipmaps(index);
701 }
702 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
703 {
704         DPSOFTRAST_Texture *texture;
705         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
706         return texture->mipmap[mip][2];
707 }
708 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
709 {
710         DPSOFTRAST_Texture *texture;
711         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
712         return texture->mipmap[mip][3];
713 }
714 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
715 {
716         DPSOFTRAST_Texture *texture;
717         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
718         return texture->mipmap[mip][4];
719 }
720 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
721 {
722         DPSOFTRAST_Texture *texture;
723         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
724         if (texture->binds)
725                 DPSOFTRAST_Flush();
726         return texture->bytes + texture->mipmap[mip][0];
727 }
728 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
729 {
730         DPSOFTRAST_Texture *texture;
731         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
732         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
733         {
734                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
735                 return;
736         }
737         if (texture->binds)
738                 DPSOFTRAST_Flush();
739         texture->filter = filter;
740 }
741
742 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
743 {
744         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
745                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
746                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
747                 DPSOFTRAST_Flush();
748         dpsoftrast.fb_width = width;
749         dpsoftrast.fb_height = height;
750         dpsoftrast.fb_depthpixels = depthpixels;
751         dpsoftrast.fb_colorpixels[0] = colorpixels0;
752         dpsoftrast.fb_colorpixels[1] = colorpixels1;
753         dpsoftrast.fb_colorpixels[2] = colorpixels2;
754         dpsoftrast.fb_colorpixels[3] = colorpixels3;
755 }
756
757 static void DPSOFTRAST_Draw_FlushThreads(void);
758
759 static void DPSOFTRAST_Draw_SyncCommands(void)
760 {
761         MEMORY_BARRIER;
762         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
763 }
764
// makes sure no more than DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space bytes of the
// command pool are in use. With USE_THREADS this recomputes the in-use amount
// from the threads' commandoffset values and waits on the thread that is
// furthest behind until enough has been consumed; without threads it simply
// calls DPSOFTRAST_Draw_FlushThreads()
static void DPSOFTRAST_Draw_FreeCommandPool(int space)
{
#ifdef USE_THREADS
	const int writepos = dpsoftrast.commandpool.freecommand;
	int pending = dpsoftrast.commandpool.usedcommands;
	// the stored usage count already leaves enough room
	if (pending <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
		return;
	DPSOFTRAST_Draw_SyncCommands();
	while (1)
	{
		DPSOFTRAST_State_Thread *laggard;
		int slowest = -1;
		int t;
		// usage is the largest distance between the write position and any thread's read position
		pending = 0;
		for (t = 0; t < dpsoftrast.numthreads; t++)
		{
			int lag = writepos - dpsoftrast.threads[t].commandoffset;
			// the pool is circular, so a negative distance means the write position has wrapped
			if (lag < 0)
				lag += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
			if (lag > pending)
			{
				slowest = t;
				pending = lag;
			}
		}
		if (slowest < 0 || pending <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
			break;
		laggard = &dpsoftrast.threads[slowest];
		SDL_LockMutex(laggard->drawmutex);
		// only wait if that thread has not yet reached the synced draw position
		if (laggard->commandoffset != dpsoftrast.drawcommand)
		{
			laggard->waiting = true;
			if (laggard->starving)
				SDL_CondSignal(laggard->drawcond);
			SDL_CondWait(laggard->waitcond, laggard->drawmutex);
			laggard->waiting = false;
		}
		SDL_UnlockMutex(laggard->drawmutex);
	}
	dpsoftrast.commandpool.usedcommands = pending;
#else
	DPSOFTRAST_Draw_FlushThreads();
#endif
}
810
// rounds size up to the next multiple of COMMAND_SIZE; the masking only works
// when COMMAND_SIZE is a power of two. Note that size is evaluated twice.
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	( (size) + ( (COMMAND_SIZE - ((size) & (COMMAND_SIZE-1))) & (COMMAND_SIZE-1) ) )

// reserves pool space for a DPSOFTRAST_Command_<name> (size rounded up with
// DPSOFTRAST_ALIGNCOMMAND), tagged with DPSOFTRAST_OPCODE_<name>, and yields a
// pointer of the matching command type
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	( (DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( \
		DPSOFTRAST_OPCODE_##name, \
		DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )) ) )
815
816 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
817 {
818         DPSOFTRAST_Command *command;
819         int freecommand = dpsoftrast.commandpool.freecommand;
820         int usedcommands = dpsoftrast.commandpool.usedcommands;
821         int extra = sizeof(DPSOFTRAST_Command);
822         if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
823                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
824         if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
825         {
826                 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
827                 freecommand = dpsoftrast.commandpool.freecommand;
828                 usedcommands = dpsoftrast.commandpool.usedcommands;
829         }
830         if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
831         {
832                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
833                 command->opcode = DPSOFTRAST_OPCODE_Reset;
834                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
835                 freecommand = 0;
836         }
837         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
838         command->opcode = opcode;
839         command->commandsize = size;
840         freecommand += size;
841         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
842                 freecommand = 0;
843         dpsoftrast.commandpool.freecommand = freecommand;
844         dpsoftrast.commandpool.usedcommands = usedcommands + size;
845         return command;
846 }
847
848 static void DPSOFTRAST_UndoCommand(int size)
849 {
850         int freecommand = dpsoftrast.commandpool.freecommand;
851         int usedcommands = dpsoftrast.commandpool.usedcommands;
852         freecommand -= size;
853         usedcommands -= size;
854         dpsoftrast.commandpool.freecommand = freecommand;
855         dpsoftrast.commandpool.usedcommands = usedcommands;
856 }
857                 
858 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
859 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
860 {
861         thread->viewport[0] = command->x;
862         thread->viewport[1] = command->y;
863         thread->viewport[2] = command->width;
864         thread->viewport[3] = command->height;
865         thread->validate |= DPSOFTRAST_VALIDATE_FB;
866 }
867 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
868 {
869         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
870         command->x = x;
871         command->y = y;
872         command->width = width;
873         command->height = height;
874
875         dpsoftrast.viewport[0] = x;
876         dpsoftrast.viewport[1] = y;
877         dpsoftrast.viewport[2] = width;
878         dpsoftrast.viewport[3] = height;
879         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
880 }
881
882 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
883 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
884 {
885         int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
886         unsigned int *p;
887         unsigned int c;
888         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
889         x1 = thread->fb_clearscissor[0];
890         y1 = thread->fb_clearscissor[1];
891         x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
892         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
893         t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
894         t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
895         if(y1 < t1) y1 = t1;
896         if(y2 > t2) y2 = t2;
897         w = x2 - x1;
898         h = y2 - y1;
899         if (w < 1 || h < 1)
900                 return;
901         // FIXME: honor fb_colormask?
902         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
903         for (i = 0;i < 4;i++)
904         {
905                 if (!dpsoftrast.fb_colorpixels[i])
906                         continue;
907                 for (y = y1;y < y2;y++)
908                 {
909                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
910                         for (x = x1;x < x2;x++)
911                                 p[x] = c;
912                 }
913         }
914 }
915 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
916 {
917         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
918         command->r = r;
919         command->g = g;
920         command->b = b;
921         command->a = a;
922 }
923
924 DEFCOMMAND(3, ClearDepth, float depth;)
925 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
926 {
927         int x1, y1, x2, y2, w, h, x, y, t1, t2;
928         unsigned int *p;
929         unsigned int c;
930         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
931         x1 = thread->fb_clearscissor[0];
932         y1 = thread->fb_clearscissor[1];
933         x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
934         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
935         t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
936         t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
937         if(y1 < t1) y1 = t1;
938         if(y2 > t2) y2 = t2;
939         w = x2 - x1;
940         h = y2 - y1;
941         if (w < 1 || h < 1)
942                 return;
943         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
944         for (y = y1;y < y2;y++)
945         {
946                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
947                 for (x = x1;x < x2;x++)
948                         p[x] = c;
949         }
950 }
951 void DPSOFTRAST_ClearDepth(float d)
952 {
953         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
954         command->depth = d;
955 }
956
957 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
958 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
959 {
960         thread->colormask[0] = command->r != 0;
961         thread->colormask[1] = command->g != 0;
962         thread->colormask[2] = command->b != 0;
963         thread->colormask[3] = command->a != 0;
964         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
965 }
966 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
967 {
968         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
969         command->r = r;
970         command->g = g;
971         command->b = b;
972         command->a = a;
973 }
974
975 DEFCOMMAND(5, DepthTest, int enable;)
976 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
977 {
978         thread->depthtest = command->enable;
979         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
980 }
981 void DPSOFTRAST_DepthTest(int enable)
982 {
983         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
984         command->enable = enable;
985 }
986
987 DEFCOMMAND(6, ScissorTest, int enable;)
988 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
989 {
990         thread->scissortest = command->enable;
991         thread->validate |= DPSOFTRAST_VALIDATE_FB;
992 }
993 void DPSOFTRAST_ScissorTest(int enable)
994 {
995         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
996         command->enable = enable;
997 }
998
999 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1000 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1001 {
1002         thread->scissor[0] = command->x;
1003         thread->scissor[1] = command->y;
1004         thread->scissor[2] = command->width;
1005         thread->scissor[3] = command->height;
1006         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1007 }
1008 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1009 {
1010         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1011         command->x = x;
1012         command->y = y;
1013         command->width = width;
1014         command->height = height;
1015 }
1016
1017 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1018 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1019 {
1020         thread->blendfunc[0] = command->sfactor;
1021         thread->blendfunc[1] = command->dfactor;
1022         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1023 }
1024 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1025 {
1026         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1027         command->sfactor = sfactor;
1028         command->dfactor = dfactor;
1029 }
1030
1031 DEFCOMMAND(9, BlendSubtract, int enable;)
1032 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1033 {
1034         thread->blendsubtract = command->enable;
1035         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1036 }
1037 void DPSOFTRAST_BlendSubtract(int enable)
1038 {
1039         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1040         command->enable = enable;
1041 }
1042
1043 DEFCOMMAND(10, DepthMask, int enable;)
1044 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1045 {
1046         thread->depthmask = command->enable;
1047 }
1048 void DPSOFTRAST_DepthMask(int enable)
1049 {
1050         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1051         command->enable = enable;
1052 }
1053
1054 DEFCOMMAND(11, DepthFunc, int func;)
1055 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1056 {
1057         thread->depthfunc = command->func;
1058 }
1059 void DPSOFTRAST_DepthFunc(int func)
1060 {
1061         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1062         command->func = func;
1063 }
1064
1065 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1066 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1067 {
1068         thread->depthrange[0] = command->nearval;
1069         thread->depthrange[1] = command->farval;
1070 }
1071 void DPSOFTRAST_DepthRange(float nearval, float farval)
1072 {
1073         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1074         command->nearval = nearval;
1075         command->farval = farval;
1076 }
1077
1078 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1079 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1080 {
1081         thread->polygonoffset[0] = command->alongnormal;
1082         thread->polygonoffset[1] = command->intoview;
1083 }
1084 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1085 {
1086         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1087         command->alongnormal = alongnormal;
1088         command->intoview = intoview;
1089 }
1090
1091 DEFCOMMAND(14, CullFace, int mode;)
1092 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1093 {
1094         thread->cullface = command->mode;
1095 }
1096 void DPSOFTRAST_CullFace(int mode)
1097 {
1098         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1099         command->mode = mode;
1100 }
1101
1102 DEFCOMMAND(15, AlphaTest, int enable;)
1103 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1104 {
1105         thread->alphatest = command->enable;
1106 }
1107 void DPSOFTRAST_AlphaTest(int enable)
1108 {
1109         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1110         command->enable = enable;
1111 }
1112
1113 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1114 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1115 {
1116         thread->alphafunc = command->func;
1117         thread->alphavalue = command->ref;
1118 }
1119 void DPSOFTRAST_AlphaFunc(int func, float ref)
1120 {
1121         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1122         command->func = func;
1123         command->ref = ref;
1124 }
1125
1126 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1127 {
1128         dpsoftrast.color[0] = r;
1129         dpsoftrast.color[1] = g;
1130         dpsoftrast.color[2] = b;
1131         dpsoftrast.color[3] = a;
1132 }
1133
1134 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1135 {
1136         int outstride = blockwidth * 4;
1137         int instride = dpsoftrast.fb_width * 4;
1138         int bx1 = blockx;
1139         int by1 = blocky;
1140         int bx2 = blockx + blockwidth;
1141         int by2 = blocky + blockheight;
1142         int bw;
1143         int bh;
1144         int x;
1145         int y;
1146         unsigned char *inpixels;
1147         unsigned char *b;
1148         unsigned char *o;
1149         DPSOFTRAST_Flush();
1150         if (bx1 < 0) bx1 = 0;
1151         if (by1 < 0) by1 = 0;
1152         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1153         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1154         bw = bx2 - bx1;
1155         bh = by2 - by1;
1156         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1157         if (dpsoftrast.bigendian)
1158         {
1159                 for (y = by1;y < by2;y++)
1160                 {
1161                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1162                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1163                         for (x = bx1;x < bx2;x++)
1164                         {
1165                                 o[0] = b[3];
1166                                 o[1] = b[2];
1167                                 o[2] = b[1];
1168                                 o[3] = b[0];
1169                                 o += 4;
1170                                 b += 4;
1171                         }
1172                 }
1173         }
1174         else
1175         {
1176                 for (y = by1;y < by2;y++)
1177                 {
1178                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1179                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1180                         memcpy(o, b, bw*4);
1181                 }
1182         }
1183
1184 }
1185 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1186 {
1187         int tx1 = tx;
1188         int ty1 = ty;
1189         int tx2 = tx + width;
1190         int ty2 = ty + height;
1191         int sx1 = sx;
1192         int sy1 = sy;
1193         int sx2 = sx + width;
1194         int sy2 = sy + height;
1195         int swidth;
1196         int sheight;
1197         int twidth;
1198         int theight;
1199         int sw;
1200         int sh;
1201         int tw;
1202         int th;
1203         int y;
1204         unsigned int *spixels;
1205         unsigned int *tpixels;
1206         DPSOFTRAST_Texture *texture;
1207         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1208         if (mip < 0 || mip >= texture->mipmaps) return;
1209         if (texture->binds)
1210                 DPSOFTRAST_Flush();
1211         spixels = dpsoftrast.fb_colorpixels[0];
1212         swidth = dpsoftrast.fb_width;
1213         sheight = dpsoftrast.fb_height;
1214         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1215         twidth = texture->mipmap[mip][2];
1216         theight = texture->mipmap[mip][3];
1217         if (tx1 < 0) tx1 = 0;
1218         if (ty1 < 0) ty1 = 0;
1219         if (tx2 > twidth) tx2 = twidth;
1220         if (ty2 > theight) ty2 = theight;
1221         if (sx1 < 0) sx1 = 0;
1222         if (sy1 < 0) sy1 = 0;
1223         if (sx2 > swidth) sx2 = swidth;
1224         if (sy2 > sheight) sy2 = sheight;
1225         tw = tx2 - tx1;
1226         th = ty2 - ty1;
1227         sw = sx2 - sx1;
1228         sh = sy2 - sy1;
1229         if (tw > sw) tw = sw;
1230         if (th > sh) th = sh;
1231         if (tw < 1 || th < 1)
1232                 return;
1233         for (y = 0;y < th;y++)
1234                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1235         if (texture->mipmaps > 1)
1236                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1237 }
1238
1239 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1240 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1241 {
1242         if (thread->texbound[command->unitnum])
1243                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1244         thread->texbound[command->unitnum] = command->texture;
1245 }
1246 void DPSOFTRAST_SetTexture(int unitnum, int index)
1247 {
1248         DPSOFTRAST_Command_SetTexture *command;
1249         DPSOFTRAST_Texture *texture;
1250         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1251         {
1252                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1253                 return;
1254         }
1255         texture = DPSOFTRAST_Texture_GetByIndex(index);
1256         if (index && !texture)
1257         {
1258                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1259                 return;
1260         }
1261
1262         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1263         command->unitnum = unitnum;
1264         command->texture = texture;
1265
1266         dpsoftrast.texbound[unitnum] = texture;
1267         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1268 }
1269
1270 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1271 {
1272         dpsoftrast.pointer_vertex3f = vertex3f;
1273         dpsoftrast.stride_vertex = stride;
1274 }
1275 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1276 {
1277         dpsoftrast.pointer_color4f = color4f;
1278         dpsoftrast.pointer_color4ub = NULL;
1279         dpsoftrast.stride_color = stride;
1280 }
1281 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1282 {
1283         dpsoftrast.pointer_color4f = NULL;
1284         dpsoftrast.pointer_color4ub = color4ub;
1285         dpsoftrast.stride_color = stride;
1286 }
1287 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1288 {
1289         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1290         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1291         dpsoftrast.stride_texcoord[unitnum] = stride;
1292 }
1293
1294 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1295 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1296 {
1297         thread->shader_mode = command->mode;
1298         thread->shader_permutation = command->permutation;
1299 }
1300 void DPSOFTRAST_SetShader(int mode, int permutation)
1301 {
1302         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1303         command->mode = mode;
1304         command->permutation = permutation;
1305
1306         dpsoftrast.shader_mode = mode;
1307         dpsoftrast.shader_permutation = permutation;
1308 }
1309
1310 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1311 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1312 {
1313         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1314 }
1315 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1316 {
1317         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1318         command->index = index;
1319         command->val[0] = v0;
1320         command->val[1] = v1;
1321         command->val[2] = v2;
1322         command->val[3] = v3;
1323
1324         dpsoftrast.uniform4f[index*4+0] = v0;
1325         dpsoftrast.uniform4f[index*4+1] = v1;
1326         dpsoftrast.uniform4f[index*4+2] = v2;
1327         dpsoftrast.uniform4f[index*4+3] = v3;
1328 }
1329 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1330 {
1331         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1332         command->index = index;
1333         memcpy(command->val, v, sizeof(command->val));
1334
1335         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1336 }
1337
1338 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1339 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1340 {
1341         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1342 }
1343 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1344 {
1345 #ifdef SSE2_PRESENT
1346         int i, index;
1347         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1348         {
1349                 __m128 m0, m1, m2, m3;
1350                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1351                 command->index = index;
1352                 if (((size_t)v)&(ALIGN_SIZE-1))
1353                 {
1354                         m0 = _mm_loadu_ps(v);
1355                         m1 = _mm_loadu_ps(v+4);
1356                         m2 = _mm_loadu_ps(v+8);
1357                         m3 = _mm_loadu_ps(v+12);
1358                 }
1359                 else
1360                 {
1361                         m0 = _mm_load_ps(v);
1362                         m1 = _mm_load_ps(v+4);
1363                         m2 = _mm_load_ps(v+8);
1364                         m3 = _mm_load_ps(v+12);
1365                 }
1366                 if (transpose)
1367                 {
1368                         __m128 t0, t1, t2, t3;
1369                         t0 = _mm_unpacklo_ps(m0, m1);
1370                         t1 = _mm_unpacklo_ps(m2, m3);
1371                         t2 = _mm_unpackhi_ps(m0, m1);
1372                         t3 = _mm_unpackhi_ps(m2, m3);
1373                         m0 = _mm_movelh_ps(t0, t1);
1374                         m1 = _mm_movehl_ps(t1, t0);
1375                         m2 = _mm_movelh_ps(t2, t3);
1376                         m3 = _mm_movehl_ps(t3, t2);                     
1377                 }
1378                 _mm_store_ps(command->val, m0);
1379                 _mm_store_ps(command->val+4, m1);
1380                 _mm_store_ps(command->val+8, m2);
1381                 _mm_store_ps(command->val+12, m3);
1382                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1383                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1384                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1385                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1386         }
1387 #endif
1388 }
1389
1390 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1391 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1392 {
1393         thread->uniform1i[command->index] = command->val;
1394 }
1395 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1396 {
1397         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1398         command->index = index;
1399         command->val = i0;
1400
1401         dpsoftrast.uniform1i[command->index] = i0;
1402 }
1403
1404 #ifdef SSE2_PRESENT
1405 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1406 {
1407         float *end = dst + size*4;
1408         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1409         {
1410                 while (dst < end)
1411                 {
1412                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1413                         dst += 4;
1414                         src += stride;
1415                 }
1416         }
1417         else
1418         {
1419                 while (dst < end)
1420                 {
1421                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1422                         dst += 4;
1423                         src += stride;
1424                 }
1425         }
1426 }
1427
1428 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1429 {
1430         float *end = dst + size*4;
1431         if (stride == sizeof(float[3]))
1432         {
1433                 float *end4 = dst + (size&~3)*4;        
1434                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1435                 {
1436                         while (dst < end4)
1437                         {
1438                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1439                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1440                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1441                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1442                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1443                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1444                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1445                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1446                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1447                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1448                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1449                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1450                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1451                                 dst += 16;
1452                                 src += 4*sizeof(float[3]);
1453                         }
1454                 }
1455                 else
1456                 {
1457                         while (dst < end4)
1458                         {
1459                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1460                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1461                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1462                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1463                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1464                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1465                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1466                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1467                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1468                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1469                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1470                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1471                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1472                                 dst += 16;
1473                                 src += 4*sizeof(float[3]);
1474                         }
1475                 }
1476         }
1477         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1478         {
1479                 while (dst < end)
1480                 {
1481                         __m128 v = _mm_loadu_ps((const float *)src);
1482                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1483                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1484                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1485                         _mm_store_ps(dst, v);
1486                         dst += 4;
1487                         src += stride;
1488                 }
1489         }
1490         else
1491         {
1492                 while (dst < end)
1493                 {
1494                         __m128 v = _mm_load_ps((const float *)src);
1495                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1496                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1497                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1498                         _mm_store_ps(dst, v);
1499                         dst += 4;
1500                         src += stride;
1501                 }
1502         }
1503 }
1504
1505 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1506 {
1507         float *end = dst + size*4;
1508         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1509         if (stride == sizeof(float[2]))
1510         {
1511                 float *end2 = dst + (size&~1)*4;
1512                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1513                 {
1514                         while (dst < end2)
1515                         {
1516                                 __m128 v = _mm_loadu_ps((const float *)src);
1517                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1518                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1519                                 dst += 8;
1520                                 src += 2*sizeof(float[2]);
1521                         }
1522                 }
1523                 else
1524                 {
1525                         while (dst < end2)
1526                         {
1527                                 __m128 v = _mm_load_ps((const float *)src);
1528                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1529                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1530                                 dst += 8;
1531                                 src += 2*sizeof(float[2]);
1532                         }
1533                 }
1534         }
1535         while (dst < end)
1536         {
1537                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1538                 dst += 4;
1539                 src += stride;
1540         }
1541 }
1542
1543 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1544 {
1545         float *end = dst + size*4;
1546         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1547         if (stride == sizeof(unsigned char[4]))
1548         {
1549                 float *end4 = dst + (size&~3)*4;
1550                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1551                 {
1552                         while (dst < end4)
1553                         {
1554                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1555                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1556                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1557                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1558                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1559                                 dst += 16;
1560                                 src += 4*sizeof(unsigned char[4]);
1561                         }
1562                 }
1563                 else
1564                 {
1565                         while (dst < end4)
1566                         {
1567                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1568                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1569                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1570                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1571                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1572                                 dst += 16;
1573                                 src += 4*sizeof(unsigned char[4]);
1574                         }
1575                 }
1576         }
1577         while (dst < end)
1578         {
1579                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1580                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1581                 dst += 4;
1582                 src += stride;
1583         }
1584 }
1585
// Writes the four floats found at 'src' into each of the 'size' consecutive
// four-float slots starting at 'dst'.
// 'src' is read once with an unaligned load; 'dst' is written with aligned 16-byte stores.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	float *stop = dst + 4*size;
	for (; dst < stop; dst += 4)
		_mm_store_ps(dst, value);
}
1596 #endif
1597
// Transforms 'numitems' four-float vertices by a 4x4 matrix:
//   out = in.x*row0 + in.y*row1 + in.z*row2 + in.w*row3
// where rowN is inmatrix16f[N*4 .. N*4+3].
// out4f       - output vertices; the SSE2 path writes them with aligned 16-byte stores
// in4f        - input vertices; may be the same array as out4f (in-place transform)
// numitems    - number of vertices
// inmatrix16f - the 16 matrix floats; if they are bit-identical to the identity matrix
//               the vertices are simply copied (or left alone when out4f == in4f)
// Builds without SSE2_PRESENT use a scalar loop that adds the terms in the same order.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
#ifdef SSE2_PRESENT
	__m128 m0, m1, m2, m3;
#endif
	float *end;
	if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
	{
		// fast case for identity matrix
		if (out4f != in4f) memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	end = out4f + numitems*4;
#ifdef SSE2_PRESENT
	m0 = _mm_loadu_ps(inmatrix16f);
	m1 = _mm_loadu_ps(inmatrix16f + 4);
	m2 = _mm_loadu_ps(inmatrix16f + 8);
	m3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f);
			_mm_store_ps(out4f,
				_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
						_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
							_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
										_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
			out4f += 4;
			in4f += 4;
		}
	}
	else
	{
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f);
			_mm_store_ps(out4f,
				_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), m0),
						_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), m1),
							_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), m2),
										_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), m3)))));
			out4f += 4;
			in4f += 4;
		}
	}
#else
	// scalar fallback so the output is still produced when SSE2 is compiled out
	while (out4f < end)
	{
		// read the whole vertex first: out4f may alias in4f
		const float x = in4f[0], y = in4f[1], z = in4f[2], w = in4f[3];
		int c;
		for (c = 0;c < 4;c++)
			out4f[c] = x*inmatrix16f[c] + (y*inmatrix16f[4+c] + (z*inmatrix16f[8+c] + w*inmatrix16f[12+c]));
		out4f += 4;
		in4f += 4;
	}
#endif
}
1645
// Copies 'numitems' four-float vertices from in4f to out4f.
// Uses memcpy, so the two arrays must not overlap.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1650
1651 #ifdef SSE2_PRESENT
// Perspective-divides one vertex and maps it through the viewport vectors.
// With in = (x, y, z, w) the result stored to 'out' is
//   (center[1] + scale[1]*x/w, center[2] + scale[2]*y/w, center[3] + scale[3]*z/w, center[0] + scale[0]/w)
// i.e. element 0 of viewportcenter/viewportscale belongs to the 1/w term and
// elements 1..3 belong to x, y and z.
// 'in' is evaluated once and 'out' is assigned last, so both may name the same variable.
// The expansion is a brace block that declares locals named p and w.
1652 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1653 { \
1654         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1655         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1656         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1657         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1658 }
1659
// The body of this macro is the same as that of DPSOFTRAST_PROJECTVERTEX:
// all four elements are divided by w and mapped through viewportcenter/viewportscale,
// not only y. With in = (x, y, z, w) the result stored to 'out' is
//   (center[1] + scale[1]*x/w, center[2] + scale[2]*y/w, center[3] + scale[3]*z/w, center[0] + scale[0]/w)
// The expansion is a brace block that declares locals named p and w.
1660 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1661 { \
1662         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1663         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1664         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1665         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1666 }
1667
// Multiplies one vertex by a matrix given as four __m128 rows:
//   out = in.x*m0 + (in.y*m1 + (in.z*m2 + in.w*m3))
// 'in' is evaluated once into a local named p before 'out' is assigned,
// so 'out' and 'in' may name the same variable.
1668 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1669 { \
1670         __m128 p = (in); \
1671         out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1672                                                   _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1673                                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1674                                                                                         _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
1675 }
1676
// Computes a screen-space y range for the box spanned by minpos/maxpos and reports
// which of its eight corners fail the z + w >= 0 test after being transformed.
// starty, endy   - receive the truncated results (see the end of the function)
// minpos, maxpos - component-wise minimum and maximum the eight corners are built from
// viewportcenter, viewportscale - viewport vectors; only their element 2 affects the result
// m0..m3         - matrix rows applied to every corner via DPSOFTRAST_TRANSFORMVERTEX
// Returns a bitmask in which bit k is still set if corner k failed the z + w >= 0 test.
1677 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1678 {
1679         int clipmask = 0xFF;
1680         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
// swap elements 0 and 1 of every matrix row so that the transformed y ends up in
// element 0, which is the element the scalar (_ss) operations below work on
1681         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1682         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1683         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1684         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
// BBFRONT(k, pos): transform corner k into bb[k] and put z + w into element 0 of
// clipdist[k]. If that value is >= 0, clear bit k of clipmask and fold
// element 0 of bb[k] divided by w into minproj/maxproj.
1685         #define BBFRONT(k, pos) \
1686         { \
1687                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1688                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1689                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1690                 { \
1691                         __m128 proj; \
1692                         clipmask &= ~(1<<k); \
1693                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1694                         minproj = _mm_min_ss(minproj, proj); \
1695                         maxproj = _mm_max_ss(maxproj, proj); \
1696                 } \
1697         }
// corner numbering: bit 0 of k selects maxpos for x, bit 1 for y, bit 2 for z;
// w is taken from minpos for corners 0-3 and from maxpos for corners 4-7
1698         BBFRONT(0, minpos); 
1699         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1700         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1701         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1702         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1703         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1704         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1705         BBFRONT(7, maxpos);
// BBCLIP(k): for a corner that failed the test, visit the three corners whose index
// differs from k in exactly one bit (k^1, k^2, k^4). For each of those that passed,
// interpolate along the connecting edge to the point where z + w reaches 0
// (frac = d[k] / (d[k] - d[other])) and fold that point's element 0 divided by w
// into minproj/maxproj.
1706         #define BBCLIP(k) \
1707         { \
1708                 if (clipmask&(1<<k)) \
1709                 { \
1710                         if (!(clipmask&(1<<(k^1)))) \
1711                         { \
1712                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1713                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1714                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1715                                 minproj = _mm_min_ss(minproj, proj); \
1716                                 maxproj = _mm_max_ss(maxproj, proj); \
1717                         } \
1718                         if (!(clipmask&(1<<(k^2)))) \
1719                         { \
1720                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1721                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1722                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1723                                 minproj = _mm_min_ss(minproj, proj); \
1724                                 maxproj = _mm_max_ss(maxproj, proj); \
1725                         } \
1726                         if (!(clipmask&(1<<(k^4)))) \
1727                         { \
1728                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1729                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1730                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1731                                 minproj = _mm_min_ss(minproj, proj); \
1732                                 maxproj = _mm_max_ss(maxproj, proj); \
1733                         } \
1734                 } \
1735         }
1736         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
// move element 2 of the viewport vectors into element 0 for the scalar math below
1737         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1738         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
// keep minproj from going below -2 and maxproj from going above 2
1739         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1740         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
// map both bounds to screen space: center + value * scale
1741         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1742         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
// NOTE(review): starty is taken from maxproj and endy from minproj + 1, which
// presumably relies on the y viewport scale being negative - confirm where
// fb_viewportscale is set up
1743         *starty = _mm_cvttss_si32(maxproj);
1744         *endy = _mm_cvttss_si32(minproj)+1;
1745         return clipmask;
1746 }
1747 #endif
1748         
// Copies 'numitems' four-float vertices from in4f to out4f and writes their
// viewport-projected form (DPSOFTRAST_PROJECTVERTEX with dpsoftrast.fb_viewportcenter
// and dpsoftrast.fb_viewportscale) to screen4f.
// out4f and screen4f are written with aligned 16-byte stores; in4f may be unaligned.
// If both starty and endy are non-NULL, the component-wise minimum and maximum of the
// input vertices are handed to DPSOFTRAST_Vertex_BoundY together with identity matrix
// rows, and its result is returned. Otherwise the function returns 0.
// Note that the first vertex is loaded to seed the min/max before the loop runs.
// Without SSE2_PRESENT nothing is written and 0 is returned.
static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
{
#ifdef SSE2_PRESENT
	float *end = out4f + numitems*4;
	__m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	__m128 minpos, maxpos;
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		minpos = maxpos = _mm_loadu_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	else
	{
		minpos = maxpos = _mm_load_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	if (starty && endy)
		return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
					_mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
					_mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
					_mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
					_mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
#endif
	// kept outside the #ifdef so that a build without SSE2_PRESENT does not fall off
	// the end of a non-void function whose result the caller uses
	return 0;
}
1796
// Transforms 'numitems' four-float vertices by inmatrix16f (DPSOFTRAST_TRANSFORMVERTEX),
// stores the transformed vertices to out4f and their viewport-projected form
// (DPSOFTRAST_PROJECTVERTEX) to screen4f.
// out4f and screen4f are written with aligned 16-byte stores; in4f may be unaligned.
// A matrix that is bit-identical to the identity is forwarded to DPSOFTRAST_Vertex_Project.
// If both starty and endy are non-NULL, the component-wise minimum and maximum of the
// untransformed input vertices are handed to DPSOFTRAST_Vertex_BoundY together with the
// matrix rows, and its result is returned. Otherwise the function returns 0.
// Note that the first vertex is loaded to seed the min/max before the loop runs.
// Without SSE2_PRESENT nothing is written and 0 is returned.
static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
	float *end;
	if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
		return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
	end = out4f + numitems*4;
	viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
	viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	m0 = _mm_loadu_ps(inmatrix16f);
	m1 = _mm_loadu_ps(inmatrix16f + 4);
	m2 = _mm_loadu_ps(inmatrix16f + 8);
	m3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		minpos = maxpos = _mm_loadu_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	else
	{
		minpos = maxpos = _mm_load_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	if (starty && endy)
		return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
#endif
	// kept outside the #ifdef so that a build without SSE2_PRESENT does not fall off
	// the end of a non-void function whose result the caller uses
	return 0;
}
1851
1852 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1853 {
1854         float *outf = dpsoftrast.post_array4f[outarray];
1855         const unsigned char *inb;
1856         int firstvertex = dpsoftrast.firstvertex;
1857         int numvertices = dpsoftrast.numvertices;
1858         int stride;
1859         switch(inarray)
1860         {
1861         case DPSOFTRAST_ARRAY_POSITION:
1862                 stride = dpsoftrast.stride_vertex;
1863                 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1864                 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1865                 break;
1866         case DPSOFTRAST_ARRAY_COLOR:
1867                 stride = dpsoftrast.stride_color;
1868                 if (dpsoftrast.pointer_color4f)
1869                 {
1870                         inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1871                         DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1872                 }
1873                 else if (dpsoftrast.pointer_color4ub)
1874                 {
1875                         stride = dpsoftrast.stride_color;
1876                         inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1877                         DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1878                 }
1879                 else
1880                 {
1881                         DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1882                 }
1883                 break;
1884         default:
1885                 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1886                 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1887                 {
1888                         inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1889                         switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1890                         {
1891                         case 2:
1892                                 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1893                                 break;
1894                         case 3:
1895                                 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1896                                 break;
1897                         case 4:
1898                                 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1899                                 break;
1900                         }
1901                 }
1902                 break;
1903         }
1904         return outf;
1905 }
1906
1907 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1908 {
1909         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1910         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1911         return data;
1912 }
1913
#if 0
// (compiled out) Projects a vertex array to dpsoftrast.screencoord4f without transforming it,
// storing the result of DPSOFTRAST_Vertex_Project in dpsoftrast.drawclipped.
// With inarray >= 0 the array is first loaded via DPSOFTRAST_Array_Load.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
}
#endif
1922
1923 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1924 {
1925         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1926         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
1927         return data;
1928 }
1929
1930 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1931 {
1932         int x;
1933         int startx = span->startx;
1934         int endx = span->endx;
1935         float wslope = triangle->w[0];
1936         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1937         float endz = 1.0f / (w + wslope * startx);
1938         for (x = startx;x < endx;)
1939         {
1940                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1941                 float z = endz, dz;
1942                 if(nextsub >= endx) nextsub = endsub = endx-1;
1943                 endz = 1.0f / (w + wslope * nextsub);
1944                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1945                 for (; x <= endsub; x++, z += dz)
1946                         zf[x] = z;
1947         }
1948 }
1949
1950 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1951 {
1952         int x;
1953         int startx = span->startx;
1954         int endx = span->endx;
1955         int d[4];
1956         float a, b;
1957         unsigned char * RESTRICT pixelmask = span->pixelmask;
1958         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1959         if (!pixel)
1960                 return;
1961         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1962         // handle alphatest now (this affects depth writes too)
1963         if (thread->alphatest)
1964                 for (x = startx;x < endx;x++)
1965                         if (in4f[x*4+3] < 0.5f)
1966                                 pixelmask[x] = false;
1967         // FIXME: this does not handle bigendian
1968         switch(thread->fb_blendmode)
1969         {
1970         case DPSOFTRAST_BLENDMODE_OPAQUE:
1971                 for (x = startx;x < endx;x++)
1972                 {
1973                         if (!pixelmask[x])
1974                                 continue;
1975                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1976                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1977                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1978                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1979                         pixel[x*4+0] = d[0];
1980                         pixel[x*4+1] = d[1];
1981                         pixel[x*4+2] = d[2];
1982                         pixel[x*4+3] = d[3];
1983                 }
1984                 break;
1985         case DPSOFTRAST_BLENDMODE_ALPHA:
1986                 for (x = startx;x < endx;x++)
1987                 {
1988                         if (!pixelmask[x])
1989                                 continue;
1990                         a = in4f[x*4+3] * 255.0f;
1991                         b = 1.0f - in4f[x*4+3];
1992                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1993                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1994                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1995                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1996                         pixel[x*4+0] = d[0];
1997                         pixel[x*4+1] = d[1];
1998                         pixel[x*4+2] = d[2];
1999                         pixel[x*4+3] = d[3];
2000                 }
2001                 break;
2002         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2003                 for (x = startx;x < endx;x++)
2004                 {
2005                         if (!pixelmask[x])
2006                                 continue;
2007                         a = in4f[x*4+3] * 255.0f;
2008                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2009                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2010                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2011                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2012                         pixel[x*4+0] = d[0];
2013                         pixel[x*4+1] = d[1];
2014                         pixel[x*4+2] = d[2];
2015                         pixel[x*4+3] = d[3];
2016                 }
2017                 break;
2018         case DPSOFTRAST_BLENDMODE_ADD:
2019                 for (x = startx;x < endx;x++)
2020                 {
2021                         if (!pixelmask[x])
2022                                 continue;
2023                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2024                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2025                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2026                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2027                         pixel[x*4+0] = d[0];
2028                         pixel[x*4+1] = d[1];
2029                         pixel[x*4+2] = d[2];
2030                         pixel[x*4+3] = d[3];
2031                 }
2032                 break;
2033         case DPSOFTRAST_BLENDMODE_INVMOD:
2034                 for (x = startx;x < endx;x++)
2035                 {
2036                         if (!pixelmask[x])
2037                                 continue;
2038                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2039                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2040                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2041                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2042                         pixel[x*4+0] = d[0];
2043                         pixel[x*4+1] = d[1];
2044                         pixel[x*4+2] = d[2];
2045                         pixel[x*4+3] = d[3];
2046                 }
2047                 break;
2048         case DPSOFTRAST_BLENDMODE_MUL:
2049                 for (x = startx;x < endx;x++)
2050                 {
2051                         if (!pixelmask[x])
2052                                 continue;
2053                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2054                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2055                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2056                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2057                         pixel[x*4+0] = d[0];
2058                         pixel[x*4+1] = d[1];
2059                         pixel[x*4+2] = d[2];
2060                         pixel[x*4+3] = d[3];
2061                 }
2062                 break;
2063         case DPSOFTRAST_BLENDMODE_MUL2:
2064                 for (x = startx;x < endx;x++)
2065                 {
2066                         if (!pixelmask[x])
2067                                 continue;
2068                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2069                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2070                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2071                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2072                         pixel[x*4+0] = d[0];
2073                         pixel[x*4+1] = d[1];
2074                         pixel[x*4+2] = d[2];
2075                         pixel[x*4+3] = d[3];
2076                 }
2077                 break;
2078         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2079                 for (x = startx;x < endx;x++)
2080                 {
2081                         if (!pixelmask[x])
2082                                 continue;
2083                         a = in4f[x*4+3] * -255.0f;
2084                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2085                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2086                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2087                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2088                         pixel[x*4+0] = d[0];
2089                         pixel[x*4+1] = d[1];
2090                         pixel[x*4+2] = d[2];
2091                         pixel[x*4+3] = d[3];
2092                 }
2093                 break;
2094         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2095                 for (x = startx;x < endx;x++)
2096                 {
2097                         if (!pixelmask[x])
2098                                 continue;
2099                         a = 255.0f;
2100                         b = 1.0f - in4f[x*4+3];
2101                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2102                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2103                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2104                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2105                         pixel[x*4+0] = d[0];
2106                         pixel[x*4+1] = d[1];
2107                         pixel[x*4+2] = d[2];
2108                         pixel[x*4+3] = d[3];
2109                 }
2110                 break;
2111         }
2112 }
2113
2114 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2115 {
2116 #ifdef SSE2_PRESENT
2117         int x;
2118         int startx = span->startx;
2119         int endx = span->endx;
2120         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2121         unsigned char * RESTRICT pixelmask = span->pixelmask;
2122         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2123         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2124         if (!pixel)
2125                 return;
2126         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2127         pixeli += span->y * dpsoftrast.fb_width + span->x;
2128         // handle alphatest now (this affects depth writes too)
2129         if (thread->alphatest)
2130                 for (x = startx;x < endx;x++)
2131                         if (in4ub[x*4+3] < 0.5f)
2132                                 pixelmask[x] = false;
2133         // FIXME: this does not handle bigendian
2134         switch(thread->fb_blendmode)
2135         {
2136         case DPSOFTRAST_BLENDMODE_OPAQUE:
2137                 for (x = startx;x + 4 <= endx;)
2138                 {
2139                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2140                         {
2141                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2142                                 x += 4;
2143                         }
2144                         else
2145                         {
2146                                 if (pixelmask[x])
2147                                         pixeli[x] = ini[x];
2148                                 x++;
2149                         }
2150                 }
2151                 for (;x < endx;x++)
2152                         if (pixelmask[x])
2153                                 pixeli[x] = ini[x];
2154                 break;
2155         case DPSOFTRAST_BLENDMODE_ALPHA:
2156         #define FINISHBLEND(blend2, blend1) \
2157                 for (x = startx;x + 2 <= endx;x += 2) \
2158                 { \
2159                         __m128i src, dst; \
2160                         switch (*(const unsigned short*)&pixelmask[x]) \
2161                         { \
2162                         case 0x0101: \
2163                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2164                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2165                                 blend2; \
2166                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2167                                 continue; \
2168                         case 0x0100: \
2169                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2170                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2171                                 blend1; \
2172                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2173                                 continue; \
2174                         case 0x0001: \
2175                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2176                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2177                                 blend1; \
2178                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2179                                 continue; \
2180                         } \
2181                         break; \
2182                 } \
2183                 for(;x < endx; x++) \
2184                 { \
2185                         __m128i src, dst; \
2186                         if (!pixelmask[x]) \
2187                                 continue; \
2188                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2189                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2190                         blend1; \
2191                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2192                 }
2193
2194                 FINISHBLEND({
2195                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2196                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2197                 }, {
2198                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2199                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2200                 });
2201                 break;
2202         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2203                 FINISHBLEND({
2204                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2205                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2206                 }, {
2207                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2208                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2209                 });
2210                 break;
2211         case DPSOFTRAST_BLENDMODE_ADD:
2212                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2213                 break;
2214         case DPSOFTRAST_BLENDMODE_INVMOD:
2215                 FINISHBLEND({
2216                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2217                 }, {
2218                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2219                 });
2220                 break;
2221         case DPSOFTRAST_BLENDMODE_MUL:
2222                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2223                 break;
2224         case DPSOFTRAST_BLENDMODE_MUL2:
2225                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2226                 break;
2227         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2228                 FINISHBLEND({
2229                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2230                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2231                 }, {
2232                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2233                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2234                 });
2235                 break;
2236         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2237                 FINISHBLEND({
2238                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2239                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2240                 }, {
2241                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2242                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2243                 });
2244                 break;
2245         }
2246 #endif
2247 }
2248
2249 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2250 {
2251         int x;
2252         int startx = span->startx;
2253         int endx = span->endx;
2254         int flags;
2255         float c[4];
2256         float data[4];
2257         float slope[4];
2258         float tc[2], endtc[2];
2259         float tcscale[2];
2260         unsigned int tci[2];
2261         unsigned int tci1[2];
2262         unsigned int tcimin[2];
2263         unsigned int tcimax[2];
2264         int tciwrapmask[2];
2265         int tciwidth;
2266         int filter;
2267         int mip;
2268         const unsigned char * RESTRICT pixelbase;
2269         const unsigned char * RESTRICT pixel[4];
2270         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2271         // if no texture is bound, just fill it with white
2272         if (!texture)
2273         {
2274                 for (x = startx;x < endx;x++)
2275                 {
2276                         out4f[x*4+0] = 1.0f;
2277                         out4f[x*4+1] = 1.0f;
2278                         out4f[x*4+2] = 1.0f;
2279                         out4f[x*4+3] = 1.0f;
2280                 }
2281                 return;
2282         }
2283         mip = triangle->mip[texunitindex];
2284         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2285         // if this mipmap of the texture is 1 pixel, just fill it with that color
2286         if (texture->mipmap[mip][1] == 4)
2287         {
2288                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2289                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2290                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2291                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2292                 for (x = startx;x < endx;x++)
2293                 {
2294                         out4f[x*4+0] = c[0];
2295                         out4f[x*4+1] = c[1];
2296                         out4f[x*4+2] = c[2];
2297                         out4f[x*4+3] = c[3];
2298                 }
2299                 return;
2300         }
2301         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2302         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2303         flags = texture->flags;
2304         tcscale[0] = texture->mipmap[mip][2];
2305         tcscale[1] = texture->mipmap[mip][3];
2306         tciwidth = texture->mipmap[mip][2];
2307         tcimin[0] = 0;
2308         tcimin[1] = 0;
2309         tcimax[0] = texture->mipmap[mip][2]-1;
2310         tcimax[1] = texture->mipmap[mip][3]-1;
2311         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2312         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2313         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2314         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2315         for (x = startx;x < endx;)
2316         {
2317                 unsigned int subtc[2];
2318                 unsigned int substep[2];
2319                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2320                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2321                 if(nextsub >= endx)
2322                 {
2323                         nextsub = endsub = endx-1;      
2324                         if(x < nextsub) subscale = 65536.0f / (nextsub - x);
2325                 }
2326                 tc[0] = endtc[0];
2327                 tc[1] = endtc[1];
2328                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2329                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2330                 substep[0] = (endtc[0] - tc[0]) * subscale;
2331                 substep[1] = (endtc[1] - tc[1]) * subscale;
2332                 subtc[0] = tc[0] * (1<<16);
2333                 subtc[1] = tc[1] * (1<<16);
2334                 if(filter)
2335                 {
2336                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2337                         {
2338                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2339                                 {
2340                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2341                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2342                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2343                                         tci[0] = subtc[0]>>16;
2344                                         tci[1] = subtc[1]>>16;
2345                                         tci1[0] = tci[0] + 1;
2346                                         tci1[1] = tci[1] + 1;
2347                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2348                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2349                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2350                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2351                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2352                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2353                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2354                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2355                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2356                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2357                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2358                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2359                                         out4f[x*4+0] = c[0];
2360                                         out4f[x*4+1] = c[1];
2361                                         out4f[x*4+2] = c[2];
2362                                         out4f[x*4+3] = c[3];
2363                                 }
2364                         }
2365                         else
2366                         {
2367                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2368                                 {
2369                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2370                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2371                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2372                                         tci[0] = subtc[0]>>16;
2373                                         tci[1] = subtc[1]>>16;
2374                                         tci1[0] = tci[0] + 1;
2375                                         tci1[1] = tci[1] + 1;
2376                                         tci[0] &= tciwrapmask[0];
2377                                         tci[1] &= tciwrapmask[1];
2378                                         tci1[0] &= tciwrapmask[0];
2379                                         tci1[1] &= tciwrapmask[1];
2380                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2381                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2382                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2383                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2384                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2385                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2386                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2387                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2388                                         out4f[x*4+0] = c[0];
2389                                         out4f[x*4+1] = c[1];
2390                                         out4f[x*4+2] = c[2];
2391                                         out4f[x*4+3] = c[3];
2392                                 }
2393                         }
2394                 }
2395                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2396                 {
2397                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2398                         {
2399                                 tci[0] = subtc[0]>>16;
2400                                 tci[1] = subtc[1]>>16;
2401                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2402                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2403                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2404                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2405                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2406                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2407                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2408                                 out4f[x*4+0] = c[0];
2409                                 out4f[x*4+1] = c[1];
2410                                 out4f[x*4+2] = c[2];
2411                                 out4f[x*4+3] = c[3];
2412                         }
2413                 }
2414                 else
2415                 {
2416                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2417                         {
2418                                 tci[0] = subtc[0]>>16;
2419                                 tci[1] = subtc[1]>>16;
2420                                 tci[0] &= tciwrapmask[0];
2421                                 tci[1] &= tciwrapmask[1];
2422                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2423                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2424                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2425                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2426                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2427                                 out4f[x*4+0] = c[0];
2428                                 out4f[x*4+1] = c[1];
2429                                 out4f[x*4+2] = c[2];
2430                                 out4f[x*4+3] = c[3];
2431                         }
2432                 }
2433         }
2434 }
2435
2436 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2437 {
2438 #ifdef SSE2_PRESENT
2439         int x;
2440         int startx = span->startx;
2441         int endx = span->endx;
2442         int flags;
2443         __m128 data, slope, tcscale;
2444         __m128i tcsize, tcmask, tcoffset, tcmax;
2445         __m128 tc, endtc;
2446         __m128i subtc, substep, endsubtc;
2447         int filter;
2448         int mip;
2449         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2450         const unsigned char * RESTRICT pixelbase;
2451         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2452         // if no texture is bound, just fill it with white
2453         if (!texture)
2454         {
2455                 memset(out4ub + startx*4, 255, span->length*4);
2456                 return;
2457         }
2458         mip = triangle->mip[texunitindex];
2459         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2460         // if this mipmap of the texture is 1 pixel, just fill it with that color
2461         if (texture->mipmap[mip][1] == 4)
2462         {
2463                 unsigned int k = *((const unsigned int *)pixelbase);
2464                 for (x = startx;x < endx;x++)
2465                         outi[x] = k;
2466                 return;
2467         }
2468         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2469         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2470         flags = texture->flags;
2471         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2472         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2473         tcscale = _mm_cvtepi32_ps(tcsize);
2474         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2475         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2476         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2477         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2478         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2479         tcmax = _mm_packs_epi32(tcmask, tcmask);
2480         for (x = startx;x < endx;)
2481         {
2482                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2483                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2484                 if(nextsub >= endx)
2485                 {
2486                         nextsub = endsub = endx-1;
2487                         if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2488                 }       
2489                 tc = endtc;
2490                 subtc = endsubtc;
2491                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2492                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2493                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2494                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2495                 substep = _mm_slli_epi32(substep, 1);
2496                 if (filter)
2497                 {
2498                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2499                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2500                         {
2501                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2502                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2503                                 {
2504                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2505                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2506                                         tci = _mm_madd_epi16(tci, tcoffset);
2507                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2508                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2509                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2510                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2511                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2512                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2513                                         fracm = _mm_srli_epi16(subtc, 1);
2514                                         pix1 = _mm_add_epi16(pix1,
2515                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2516                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2517                                         pix3 = _mm_add_epi16(pix3,
2518                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2519                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2520                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2521                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2522                                         pix2 = _mm_add_epi16(pix2,
2523                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2524                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2525                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2526                                 }
2527                                 if (x <= endsub)
2528                                 {
2529                                         const unsigned char * RESTRICT ptr1;
2530                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2531                                         tci = _mm_madd_epi16(tci, tcoffset);
2532                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2533                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2534                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2535                                         fracm = _mm_srli_epi16(subtc, 1);
2536                                         pix1 = _mm_add_epi16(pix1,
2537                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2538                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2539                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2540                                         pix1 = _mm_add_epi16(pix1,
2541                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2542                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2543                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2544                                         x++;
2545                                 }
2546                         }
2547                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2548                         {
2549                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2550                                 {
2551                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2552                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2553                                         tci = _mm_madd_epi16(tci, tcoffset);
2554                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2555                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2556                                                                                         _mm_setzero_si128());
2557                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2558                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2559                                                                                         _mm_setzero_si128());
2560                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2561                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2562                                         tci = _mm_madd_epi16(tci, tcoffset);
2563                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2564                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2565                                                                                         _mm_setzero_si128());
2566                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2567                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2568                                                                                         _mm_setzero_si128());
2569                                         fracm = _mm_srli_epi16(subtc, 1);
2570                                         pix1 = _mm_add_epi16(pix1,
2571                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2572                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2573                                         pix3 = _mm_add_epi16(pix3,
2574                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2575                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2576                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2577                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2578                                         pix2 = _mm_add_epi16(pix2,
2579                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2580                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2581                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2582                                 }
2583                                 if (x <= endsub)
2584                                 {
2585                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2586                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2587                                         tci = _mm_madd_epi16(tci, tcoffset);
2588                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2589                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2590                                                                                         _mm_setzero_si128());
2591                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2592                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2593                                                                                         _mm_setzero_si128());
2594                                         fracm = _mm_srli_epi16(subtc, 1);
2595                                         pix1 = _mm_add_epi16(pix1,
2596                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2597                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2598                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2599                                         pix1 = _mm_add_epi16(pix1,
2600                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2601                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2602                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2603                                         x++;
2604                                 }
2605                         }
2606                         else
2607                         {
2608                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2609                                 {
2610                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2611                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2612                                         tci = _mm_madd_epi16(tci, tcoffset);
2613                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2614                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2615                                                                                         _mm_setzero_si128());
2616                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2617                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2618                                                                                         _mm_setzero_si128());
2619                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2620                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2621                                         tci = _mm_madd_epi16(tci, tcoffset);
2622                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2623                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2624                                                                                         _mm_setzero_si128());
2625                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2626                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2627                                                                                         _mm_setzero_si128());
2628                                         fracm = _mm_srli_epi16(subtc, 1);
2629                                         pix1 = _mm_add_epi16(pix1,
2630                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2631                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2632                                         pix3 = _mm_add_epi16(pix3,
2633                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2634                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2635                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2636                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2637                                         pix2 = _mm_add_epi16(pix2,
2638                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2639                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2640                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2641                                 }
2642                                 if (x <= endsub)
2643                                 {
2644                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2645                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2646                                         tci = _mm_madd_epi16(tci, tcoffset);
2647                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2648                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2649                                                                                         _mm_setzero_si128());
2650                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2651                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2652                                                                                         _mm_setzero_si128());
2653                                         fracm = _mm_srli_epi16(subtc, 1);
2654                                         pix1 = _mm_add_epi16(pix1,
2655                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2656                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2657                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2658                                         pix1 = _mm_add_epi16(pix1,
2659                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2660                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2661                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2662                                         x++;
2663                                 }
2664                         }
2665                 }
2666                 else
2667                 {
2668                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2669                         {
2670                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2671                                 {
2672                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2673                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2674                                         tci = _mm_madd_epi16(tci, tcoffset);
2675                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2676                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2677                                 }
2678                                 if (x <= endsub)
2679                                 {
2680                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2681                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2682                                         tci = _mm_madd_epi16(tci, tcoffset);
2683                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2684                                         x++;
2685                                 }
2686                         }
2687                         else
2688                         {
2689                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2690                                 {
2691                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2692                                         tci = _mm_and_si128(tci, tcmax); 
2693                                         tci = _mm_madd_epi16(tci, tcoffset);
2694                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2695                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2696                                 }
2697                                 if (x <= endsub)
2698                                 {
2699                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2700                                         tci = _mm_and_si128(tci, tcmax); 
2701                                         tci = _mm_madd_epi16(tci, tcoffset);
2702                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2703                                         x++;
2704                                 }
2705                         }
2706                 }
2707         }
2708 #endif
2709 }
2710
2711 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2712 {
2713         // TODO: IMPLEMENT
2714         memset(out4ub, 255, span->length*4);
2715 }
2716
// Shadowmap lookup placeholder: the input vector is not examined and the
// result is always 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2722
2723 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2724 {
2725         int x;
2726         int startx = span->startx;
2727         int endx = span->endx;
2728         float c[4];
2729         float data[4];
2730         float slope[4];
2731         float z;
2732         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2733         for (x = startx;x < endx;x++)
2734         {
2735                 z = zf[x];
2736                 c[0] = (data[0] + slope[0]*x) * z;
2737                 c[1] = (data[1] + slope[1]*x) * z;
2738                 c[2] = (data[2] + slope[2]*x) * z;
2739                 c[3] = (data[3] + slope[3]*x) * z;
2740                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2741                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2742                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2743                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2744         }
2745 }
2746
2747 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2748 {
2749         int x;
2750         int startx = span->startx;
2751         int endx = span->endx;
2752         float c[4];
2753         float data[4];
2754         float slope[4];
2755         float z;
2756         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2757         for (x = startx;x < endx;x++)
2758         {
2759                 z = zf[x];
2760                 c[0] = (data[0] + slope[0]*x) * z;
2761                 c[1] = (data[1] + slope[1]*x) * z;
2762                 c[2] = (data[2] + slope[2]*x) * z;
2763                 c[3] = (data[3] + slope[3]*x) * z;
2764                 out4f[x*4+0] = c[0];
2765                 out4f[x*4+1] = c[1];
2766                 out4f[x*4+2] = c[2];
2767                 out4f[x*4+3] = c[3];
2768         }
2769 }
2770
2771 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2772 {
2773         int x, startx = span->startx, endx = span->endx;
2774         float c[4], localcolor[4];
2775         localcolor[0] = subcolor[0];
2776         localcolor[1] = subcolor[1];
2777         localcolor[2] = subcolor[2];
2778         localcolor[3] = subcolor[3];
2779         for (x = startx;x < endx;x++)
2780         {
2781                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2782                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2783                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2784                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2785                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2786                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2787                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2788                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2789         }
2790 }
2791
2792 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2793 {
2794         int x, startx = span->startx, endx = span->endx;
2795         for (x = startx;x < endx;x++)
2796         {
2797                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2798                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2799                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2800                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2801         }
2802 }
2803
2804 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2805 {
2806         int x, startx = span->startx, endx = span->endx;
2807         for (x = startx;x < endx;x++)
2808         {
2809                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2810                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2811                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2812                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2813         }
2814 }
2815
2816 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2817 {
2818         int x, startx = span->startx, endx = span->endx;
2819         float a, b;
2820         for (x = startx;x < endx;x++)
2821         {
2822                 a = 1.0f - inb4f[x*4+3];
2823                 b = inb4f[x*4+3];
2824                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2825                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2826                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2827                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2828         }
2829 }
2830
2831 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2832 {
2833         int x, startx = span->startx, endx = span->endx;
2834         float localcolor[4], ilerp, lerp;
2835         localcolor[0] = color[0];
2836         localcolor[1] = color[1];
2837         localcolor[2] = color[2];
2838         localcolor[3] = color[3];
2839         ilerp = 1.0f - localcolor[3];
2840         lerp = localcolor[3];
2841         for (x = startx;x < endx;x++)
2842         {
2843                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2844                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2845                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2846                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2847         }
2848 }
2849
2850
2851
2852 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2853 {
2854 #ifdef SSE2_PRESENT
2855         int x;
2856         int startx = span->startx;
2857         int endx = span->endx;
2858         __m128 data, slope;
2859         __m128 mod, endmod;
2860         __m128i submod, substep, endsubmod;
2861         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2862         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2863         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2864         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2865         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2866         for (x = startx; x < endx;)
2867         {
2868                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2869                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2870                 if(nextsub >= endx)
2871                 {
2872                         nextsub = endsub = endx-1;
2873                         if(x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2874                 }
2875                 mod = endmod;
2876                 submod = endsubmod;
2877                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2878                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2879                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2880                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2881                 substep = _mm_packs_epi32(substep, substep);
2882                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2883                 {
2884                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2885                         pix = _mm_mulhi_epu16(pix, submod);
2886                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2887                 }
2888                 if (x <= endsub)
2889                 {
2890                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2891                         pix = _mm_mulhi_epu16(pix, submod);
2892                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2893                         x++;
2894                 }
2895         }
2896 #endif
2897 }
2898
2899 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2900 {
2901 #ifdef SSE2_PRESENT
2902         int x;
2903         int startx = span->startx;
2904         int endx = span->endx;
2905         __m128 data, slope;
2906         __m128 mod, endmod;
2907         __m128i submod, substep, endsubmod;
2908         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2909         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2910         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2911         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2912         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2913         for (x = startx; x < endx;)
2914         {
2915                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2916                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2917                 if(nextsub >= endx)
2918                 {
2919                         nextsub = endsub = endx-1;
2920                         if(x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2921                 }
2922                 mod = endmod;
2923                 submod = endsubmod;
2924                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2925                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2926                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2927                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2928                 substep = _mm_packs_epi32(substep, substep);
2929                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2930                 {
2931                         __m128i pix = _mm_srai_epi16(submod, 4);
2932                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2933                 }
2934                 if (x <= endsub)
2935                 {
2936                         __m128i pix = _mm_srai_epi16(submod, 4);
2937                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2938                         x++;
2939                 }
2940         }
2941 #endif
2942 }
2943
2944 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2945 {
2946 #ifdef SSE2_PRESENT
2947         int x, startx = span->startx, endx = span->endx;
2948         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2949         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2950         for (x = startx;x+2 <= endx;x+=2)
2951         {
2952                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2953                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2954                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2955                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2956         }
2957         if(x < endx)
2958         {
2959                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2960                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2961                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2962                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2963         }
2964 #endif
2965 }
2966
2967 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2968 {
2969 #ifdef SSE2_PRESENT
2970         int x, startx = span->startx, endx = span->endx;
2971         for (x = startx;x+2 <= endx;x+=2)
2972         {
2973                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2974                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2975                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2976                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2977         }
2978         if(x < endx)
2979         {
2980                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2981                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2982                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2983                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2984         }
2985 #endif
2986 }
2987
2988 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2989 {
2990 #ifdef SSE2_PRESENT
2991         int x, startx = span->startx, endx = span->endx;
2992         for (x = startx;x+2 <= endx;x+=2)
2993         {
2994                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2995                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2996                 pix1 = _mm_add_epi16(pix1, pix2);
2997                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2998         }
2999         if(x < endx)
3000         {
3001                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3002                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3003                 pix1 = _mm_add_epi16(pix1, pix2);
3004                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3005         }
3006 #endif
3007 }
3008
3009 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3010 {
3011 #ifdef SSE2_PRESENT
3012         int x, startx = span->startx, endx = span->endx;
3013         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3014         tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
3015         for (x = startx;x+2 <= endx;x+=2)
3016         {
3017                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3018                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3019                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3020                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3021         }
3022         if(x < endx)
3023         {
3024                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3025                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3026                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3027                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3028         }
3029 #endif
3030 }
3031
3032 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3033 {
3034 #ifdef SSE2_PRESENT
3035         int x, startx = span->startx, endx = span->endx;
3036         for (x = startx;x+2 <= endx;x+=2)
3037         {
3038                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3039                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3040                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3041                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3042                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3043         }
3044         if(x < endx)
3045         {
3046                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3047                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3048                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3049                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3050                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3051         }
3052 #endif
3053 }
3054
3055 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3056 {
3057 #ifdef SSE2_PRESENT
3058         int x, startx = span->startx, endx = span->endx;
3059         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3060         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
3061         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3062         for (x = startx;x+2 <= endx;x+=2)
3063         {
3064                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3065                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3066                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3067         }
3068         if(x < endx)
3069         {
3070                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3071                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3072                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3073         }
3074 #endif
3075 }
3076
3077
3078
3079 void DPSOFTRAST_VertexShader_Generic(void)
3080 {
3081         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3082         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3083         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3084         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3085                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3086 }
3087
3088 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3089 {
3090         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3091         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3092         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3093         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3094         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3095         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3096         {
3097                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3098                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3099                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3100                 {
3101                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3102                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3103                         {
3104                                 // multiply
3105                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3106                         }
3107                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3108                         {
3109                                 // add
3110                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3111                         }
3112                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3113                         {
3114                                 // alphablend
3115                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3116                         }
3117                 }
3118         }
3119         else
3120                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3121         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3122 }
3123
3124
3125
3126 void DPSOFTRAST_VertexShader_PostProcess(void)
3127 {
3128         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3129         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3130         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3131 }
3132
3133 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3134 {
3135         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3136         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3137         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3138         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3139         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3140         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3141         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3142         {
3143                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3144                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3145         }
3146         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3147         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3148         {
3149                 // TODO: implement saturation
3150         }
3151         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3152         {
3153                 // TODO: implement gammaramps
3154         }
3155         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3156 }
3157
3158
3159
3160 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3161 {
3162         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3163 }
3164
3165 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3166 {
3167         // this is never called (because colormask is off when this shader is used)
3168         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3169         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3170         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3171         memset(buffer_FragColorbgra8, 0, span->length*4);
3172         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3173 }
3174
3175
3176
3177 void DPSOFTRAST_VertexShader_FlatColor(void)
3178 {
3179         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3180         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3181 }
3182
3183 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3184 {
3185         int x, startx = span->startx, endx = span->endx;
3186         int Color_Ambienti[4];
3187         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3188         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3189         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3190         Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3191         Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3192         Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3193         Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]        *256.0f);
3194         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3195         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3196         for (x = startx;x < endx;x++)
3197         {
3198                 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3199                 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3200                 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3201                 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3202         }
3203         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3204 }
3205
3206
3207
3208 void DPSOFTRAST_VertexShader_VertexColor(void)
3209 {
3210         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3211         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3212         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3213 }
3214
// Pixel stage of the vertex-colour shader for one span.
// Samples texture unit GL20TU_COLOR (texcoord array 2) and multiplies each texel by
//   Color_Ambient + Color_Diffuse * (interpolated DPSOFTRAST_ARRAY_COLOR * buffer_z[x])
// in the first three channels, and by the Alpha uniform in the fourth.
// Pixels are written straight to dpsoftrast.fb_colorpixels[0] when alpha test is off
// and the blend mode is DPSOFTRAST_BLENDMODE_OPAQUE; otherwise they go to a local
// buffer that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// The body is compiled out when SSE2_PRESENT is not defined.
3215 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3216 {
3217 #ifdef SSE2_PRESENT
3218         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // default destination: the framebuffer, offset so that pixel[x*4] addresses span pixel x
3219         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3220         int x, startx = span->startx, endx = span->endx;
3221         __m128i Color_Ambientm, Color_Diffusem;
3222         __m128 data, slope;
3223         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3224         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3225         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3226         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3227         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3228         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
             // alpha test or blending: render into the local buffer and finish through Draw_Span_FinishBGRA8 below
3229         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3230                 pixel = buffer_FragColorbgra8;
             // ambient colour scaled by 256, components 0 and 2 swapped to match the texel byte order;
             // lane 3 is replaced by the Alpha uniform scaled by 255
3231         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3232         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3233         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3234         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse colour scaled by 4096 with lane 3 cleared, so the interpolated colour never affects alpha
3235         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3236         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3237         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
             // NOTE(review): DPSOFTRAST_CALCATTRIB is defined elsewhere; from its use here it presumably
             // yields the attribute value at x = 0 in data and its per-pixel step in slope - confirm
3238         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3239         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3240         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
             // advance to the first pixel and scale by 4096: (diffuse*4096 * value*4096) >> 16 is on the
             // same 256 scale as the ambient term
3241         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3242         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3243         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
             // data is stepped once per pixel by the loop increment, including for masked-out pixels
3244         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3245         {
3246                 __m128i color, mod, pix;
                     // fast path: at least 4 pixels remain and all 4 mask bytes are 1
3247                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3248                 {
3249                         __m128i pix2, mod2;
3250                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3251                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                             // mod holds pixels x and x+1, mod2 holds x+2 and x+3; data is stepped between them
3252                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3253                         data = _mm_add_ps(data, slope);
3254                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3255                         data = _mm_add_ps(data, slope);
3256                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3257                         data = _mm_add_ps(data, slope);
3258                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
                             // texel bytes are unpacked into the high byte of each word, so the outer
                             // high-half multiply brings the product back to the 0..255 range
3259                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3260                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3261                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3262                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3263                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                             // 3 steps were taken above; the loop increment supplies the 4th for both x and data
3264                         x += 3;
3265                         continue;
3266                 }
                     // single-pixel path: skip pixels whose mask byte is 0
3267                 if(!pixelmask[x])
3268                         continue;
3269                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3270                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3271                 mod = _mm_packs_epi32(mod, mod);
3272                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3273                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3274         }
             // only needed when the span was rendered into the local buffer
3275         if(pixel == buffer_FragColorbgra8)
3276                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3277 #endif
3278 }
3279
3280
3281
3282 void DPSOFTRAST_VertexShader_Lightmap(void)
3283 {
3284         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3285         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3286         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3287 }
3288
// Pixel stage of the lightmap shader for one span.
// Samples GL20TU_COLOR (texcoord array TEXCOORD0) and GL20TU_LIGHTMAP (TEXCOORD4) and computes
//   (Color_Diffuse * lightmap + Color_Ambient) * color
// with the Alpha uniform as the fourth-channel factor.  With the GLOW permutation,
// GL20TU_GLOW (TEXCOORD0) is sampled too and Color_Glow * glow is added.
// Pixels are written straight to dpsoftrast.fb_colorpixels[0] when alpha test is off
// and the blend mode is DPSOFTRAST_BLENDMODE_OPAQUE; otherwise they go to a local
// buffer that is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
// The body is compiled out when SSE2_PRESENT is not defined.
3289 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3290 {
3291 #ifdef SSE2_PRESENT
3292         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // default destination: the framebuffer, offset so that pixel[x*4] addresses span pixel x
3293         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3294         int x, startx = span->startx, endx = span->endx;
3295         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3296         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3297         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3298         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3299         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3300         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3301         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3302         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3303         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
             // alpha test or blending: render into the local buffer and finish through Draw_Span_FinishBGRA8 below
3304         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3305                 pixel = buffer_FragColorbgra8;
             // ambient colour scaled by 256, components 0 and 2 swapped to match the texel byte order;
             // lane 3 is replaced by the Alpha uniform scaled by 255
3306         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3307         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3308         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3309         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse colour scaled by 256 with lane 3 cleared, so the lightmap never affects alpha
3310         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3311         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3312         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3313         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3314         {
3315                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
                     // glow colour scaled by 256 with lane 3 cleared
3316                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3317                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3318                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
                     // low 64 bits: ambient, high 64 bits: glow colour (used by the single-pixel path below)
3319                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3320                 for (x = startx;x < endx;x++)
3321                 {
3322                         __m128i color, lightmap, glow, pix;
                             // fast path: at least 4 pixels remain and all 4 mask bytes are 1
3323                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3324                         {
3325                                 __m128i pix2;
3326                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3327                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3328                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
                                     // pix covers pixels x and x+1, pix2 covers x+2 and x+3; texel bytes are
                                     // unpacked into the high byte of each word before the high-half multiplies
3329                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3330                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3331                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3332                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3333                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3334                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3335                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                     // the loop increment supplies the 4th step
3336                                 x += 3;
3337                                 continue;
3338                         }
                             // single-pixel path: skip pixels whose mask byte is 0
3339                         if(!pixelmask[x])
3340                                 continue;
3341                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3342                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3343                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
                             // one multiply handles both terms: the low half computes (diffuse*lightmap + ambient) * color,
                             // the high half (where lightmap is zero) computes glowcolor * glow; the halves are then summed
3344                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3345                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3346                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3347                 }
3348         }
3349         else
3350         {
                     // same as above without the glow term
3351                 for (x = startx;x < endx;x++)
3352                 {
3353                         __m128i color, lightmap, pix;
                             // fast path: at least 4 pixels remain and all 4 mask bytes are 1
3354                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3355                         {
3356                                 __m128i pix2;
3357                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3358                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3359                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3360                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3361                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3362                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3363                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                     // the loop increment supplies the 4th step
3364                                 x += 3;
3365                                 continue;
3366                         }
                             // single-pixel path: skip pixels whose mask byte is 0
3367                         if(!pixelmask[x]) 
3368                                 continue;
3369                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3370                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3371                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3372                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3373                 }
3374         }
             // only needed when the span was rendered into the local buffer
3375         if(pixel == buffer_FragColorbgra8)
3376                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3377 #endif
3378 }
3379
3380
3381
3382 void DPSOFTRAST_VertexShader_FakeLight(void)
3383 {
3384         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3385 }
3386
3387 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3388 {
3389         // TODO: IMPLEMENT
3390         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3391         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3392         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3393         memset(buffer_FragColorbgra8, 0, span->length*4);
3394         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3395 }
3396
3397
3398
// Vertex stage of the LightDirectionMap_ModelSpace shader mode.
// No mode-specific vertex work is done here: the call is forwarded unchanged
// to DPSOFTRAST_VertexShader_Lightmap().
3399 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3400 {
3401         DPSOFTRAST_VertexShader_Lightmap();
3402 }
3403
// Pixel stage of the LightDirectionMap_ModelSpace shader mode.
// Placeholder: the span is currently drawn by DPSOFTRAST_PixelShader_Lightmap()
// with the same thread/triangle/span arguments; nothing specific to this mode
// is computed yet.
3404 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3405 {
3406         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3407         // TODO: IMPLEMENT the mode-specific shading (only the Lightmap path above runs for now)
3408 }
3409
3410
3411
// Vertex stage of the LightDirectionMap_TangentSpace shader mode.
// No mode-specific vertex work is done here: the call is forwarded unchanged
// to DPSOFTRAST_VertexShader_Lightmap().
3412 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3413 {
3414         DPSOFTRAST_VertexShader_Lightmap();
3415 }
3416
// Pixel stage of the LightDirectionMap_TangentSpace shader mode.
// Placeholder: the span is currently drawn by DPSOFTRAST_PixelShader_Lightmap()
// with the same thread/triangle/span arguments; nothing specific to this mode
// is computed yet.
3417 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3418 {
3419         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3420         // TODO: IMPLEMENT the mode-specific shading (only the Lightmap path above runs for now)
3421 }
3422
3423
3424
3425 void DPSOFTRAST_VertexShader_LightDirection(void)
3426 {
3427         int i;
3428         int numvertices = dpsoftrast.numvertices;
3429         float LightDir[4];
3430         float LightVector[4];
3431         float EyePosition[4];
3432         float EyeVectorModelSpace[4];
3433         float EyeVector[4];
3434         float position[4];
3435         float svector[4];
3436         float tvector[4];
3437         float normal[4];
3438         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3439         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3440         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3441         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3442         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3443         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3444         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3445         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3446         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3447         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3448         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3449         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3450         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3451         for (i = 0;i < numvertices;i++)
3452         {
3453                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3454                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3455                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3456                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3457                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3458                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3459                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3460                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3461                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3462                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3463                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3464                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3465                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3466                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3467                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3468                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3469                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3470                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3471                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3472                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3473                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3474                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3475                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3476                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3477                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3478                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3479                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3480                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3481                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3482         }
3483         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3484 }
3485
3486 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3487 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3488 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3489 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3490 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3491 #define DPSOFTRAST_Vector3Normalize(v)\
3492 do\
3493 {\
3494         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3495         if (len)\
3496         {\
3497                 len = 1.0f / len;\
3498                 v[0] *= len;\
3499                 v[1] *= len;\
3500                 v[2] *= len;\
3501         }\
3502 }\
3503 while(0)
3504
3505 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3506 {
3507         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3508         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3509         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3510         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3511         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3512         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3513         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3514         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3515         int x, startx = span->startx, endx = span->endx;
3516         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3517         float LightVectordata[4];
3518         float LightVectorslope[4];
3519         float EyeVectordata[4];
3520         float EyeVectorslope[4];
3521         float z;
3522         float diffusetex[4];
3523         float glosstex[4];
3524         float surfacenormal[4];
3525         float lightnormal[4];
3526         float eyenormal[4];
3527         float specularnormal[4];
3528         float diffuse;
3529         float specular;
3530         float SpecularPower;
3531         int d[4];
3532         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3533         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3534         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3535         Color_Glow[3] = 0.0f;
3536         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3537         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3538         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3539         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3540         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3541         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3542         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3543         Color_Pants[3] = 0.0f;
3544         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3545         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3546         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3547         Color_Shirt[3] = 0.0f;
3548         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3549         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3550         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3551         {
3552                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3553                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3554         }
3555         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3556         {
3557                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3558         }
3559         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3560         {
3561                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3562                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3563                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3564                 Color_Diffuse[3] = 0.0f;
3565                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3566                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3567                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3568                 LightColor[3] = 0.0f;
3569                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3570                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3571                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3572                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3573                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3574                 Color_Specular[3] = 0.0f;
3575                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3576                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3577                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3578                 for (x = startx;x < endx;x++)
3579                 {
3580                         z = buffer_z[x];
3581                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3582                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3583                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3584                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3585                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3586                         {
3587                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3588                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3589                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3590                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3591                         }
3592                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3593                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3594                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3595                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3596                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3597                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3598                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3599                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3600
3601                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3602                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3603                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3604                         DPSOFTRAST_Vector3Normalize(lightnormal);
3605
3606                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3607                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3608                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3609                         DPSOFTRAST_Vector3Normalize(eyenormal);
3610
3611                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3612                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3613                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3614                         DPSOFTRAST_Vector3Normalize(specularnormal);
3615
3616                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3617                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3618                         specular = pow(specular, SpecularPower * glosstex[3]);
3619                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3620                         {
3621                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3622                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3623                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3624                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3625                         }
3626                         else
3627                         {
3628                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3629                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3630                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3631                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3632                         }
3633                         buffer_FragColorbgra8[x*4+0] = d[0];
3634                         buffer_FragColorbgra8[x*4+1] = d[1];
3635                         buffer_FragColorbgra8[x*4+2] = d[2];
3636                         buffer_FragColorbgra8[x*4+3] = d[3];
3637                 }
3638         }
3639         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3640         {
3641                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3642                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3643                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3644                 Color_Diffuse[3] = 0.0f;
3645                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3646                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3647                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3648                 LightColor[3] = 0.0f;
3649                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3650                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3651                 for (x = startx;x < endx;x++)
3652                 {
3653                         z = buffer_z[x];
3654                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3655                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3656                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3657                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3658                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3659                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3660                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3661                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3662
3663                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3664                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3665                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3666                         DPSOFTRAST_Vector3Normalize(lightnormal);
3667
3668                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3669                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3670                         {
3671                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3672                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3673                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3674                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3675                         }
3676                         else
3677                         {
3678                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3679                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3680                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3681                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3682                         }
3683                         buffer_FragColorbgra8[x*4+0] = d[0];
3684                         buffer_FragColorbgra8[x*4+1] = d[1];
3685                         buffer_FragColorbgra8[x*4+2] = d[2];
3686                         buffer_FragColorbgra8[x*4+3] = d[3];
3687                 }
3688         }
3689         else
3690         {
3691                 for (x = startx;x < endx;x++)
3692                 {
3693                         z = buffer_z[x];
3694                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3695                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3696                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3697                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3698
3699                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3700                         {
3701                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3702                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3703                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3704                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3705                         }
3706                         else
3707                         {
3708                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3709                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3710                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3711                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3712                         }
3713                         buffer_FragColorbgra8[x*4+0] = d[0];
3714                         buffer_FragColorbgra8[x*4+1] = d[1];
3715                         buffer_FragColorbgra8[x*4+2] = d[2];
3716                         buffer_FragColorbgra8[x*4+3] = d[3];
3717                 }
3718         }
3719         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3720 }
3721
3722
3723
3724 void DPSOFTRAST_VertexShader_LightSource(void)
3725 {
3726         int i;
3727         int numvertices = dpsoftrast.numvertices;
3728         float LightPosition[4];
3729         float LightVector[4];
3730         float LightVectorModelSpace[4];
3731         float EyePosition[4];
3732         float EyeVectorModelSpace[4];
3733         float EyeVector[4];
3734         float position[4];
3735         float svector[4];
3736         float tvector[4];
3737         float normal[4];
3738         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3739         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3740         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3741         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3742         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3743         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3744         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3745         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3746         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3747         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3748         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3749         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3750         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3751         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3752         for (i = 0;i < numvertices;i++)
3753         {
3754                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3755                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3756                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3757                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3758                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3759                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3760                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3761                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3762                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3763                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3764                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3765                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3766                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3767                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3768                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3769                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3770                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3771                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
3772                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3773                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3774                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3775                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3776                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3777                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3778                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3779                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3780                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3781                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3782                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3783                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3784                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3785                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3786         }
3787         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3788         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3789 }
3790
3791 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3792 {
3793 #ifdef SSE2_PRESENT
3794         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3795         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3796         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3797         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3798         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3799         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3800         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3801         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3802         int x, startx = span->startx, endx = span->endx;
3803         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3804         float CubeVectordata[4];
3805         float CubeVectorslope[4];
3806         float LightVectordata[4];
3807         float LightVectorslope[4];
3808         float EyeVectordata[4];
3809         float EyeVectorslope[4];
3810         float z;
3811         float diffusetex[4];
3812         float glosstex[4];
3813         float surfacenormal[4];
3814         float lightnormal[4];
3815         float eyenormal[4];
3816         float specularnormal[4];
3817         float diffuse;
3818         float specular;
3819         float SpecularPower;
3820         float CubeVector[4];
3821         float attenuation;
3822         int d[4];
3823         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3824         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3825         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3826         Color_Glow[3] = 0.0f;
3827         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3828         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3829         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3830         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3831         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3832         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3833         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3834         Color_Diffuse[3] = 0.0f;
3835         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3836         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3837         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3838         Color_Specular[3] = 0.0f;
3839         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3840         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3841         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3842         Color_Pants[3] = 0.0f;
3843         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3844         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3845         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3846         Color_Shirt[3] = 0.0f;
3847         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3848         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3849         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3850         LightColor[3] = 0.0f;
3851         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3852         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3853         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3854         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3855         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3856         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3857         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3858         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3859         {
3860                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3861                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3862         }
3863         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3864                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3865         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3866         {
3867                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3868                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3869                 for (x = startx;x < endx;x++)
3870                 {
3871                         z = buffer_z[x];
3872                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3873                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3874                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3875                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3876                         if (attenuation < 0.01f)
3877                                 continue;
3878                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3879                         {
3880                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3881                                 if (attenuation < 0.01f)
3882                                         continue;
3883                         }
3884
3885                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3886                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3887                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3888                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3889                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3890                         {
3891                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3892                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3893                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3894                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3895                         }
3896                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3897                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3898                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3899                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3900                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3901                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3902                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3903                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3904
3905                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3906                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3907                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3908                         DPSOFTRAST_Vector3Normalize(lightnormal);
3909
3910                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3911                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3912                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3913                         DPSOFTRAST_Vector3Normalize(eyenormal);
3914
3915                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3916                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3917                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3918                         DPSOFTRAST_Vector3Normalize(specularnormal);
3919
3920                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3921                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3922                         specular = pow(specular, SpecularPower * glosstex[3]);
3923                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3924                         {
3925                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3926                                 attenuation *= (1.0f / 255.0f);
3927                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3928                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3929                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3930                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3931                         }
3932                         else
3933                         {
3934                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3935                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3936                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3937                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3938                         }
3939                         buffer_FragColorbgra8[x*4+0] = d[0];
3940                         buffer_FragColorbgra8[x*4+1] = d[1];
3941                         buffer_FragColorbgra8[x*4+2] = d[2];
3942                         buffer_FragColorbgra8[x*4+3] = d[3];
3943                 }
3944         }
3945         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3946         {
3947                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3948                 for (x = startx;x < endx;x++)
3949                 {
3950                         z = buffer_z[x];
3951                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3952                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3953                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3954                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3955                         if (attenuation < 0.01f)
3956                                 continue;
3957                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3958                         {
3959                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3960                                 if (attenuation < 0.01f)
3961                                         continue;
3962                         }
3963
3964                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3965                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3966                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3967                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3968                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3969                         {
3970                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3971                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3972                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3973                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3974                         }
3975                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3976                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3977                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3978                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3979
3980                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3981                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3982                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3983                         DPSOFTRAST_Vector3Normalize(lightnormal);
3984
3985                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3986                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3987                         {
3988                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3989                                 attenuation *= (1.0f / 255.0f);
3990                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3991                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3992                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3993                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
3994                         }
3995                         else
3996                         {
3997                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3998                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3999                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4000                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4001                         }
4002                         buffer_FragColorbgra8[x*4+0] = d[0];
4003                         buffer_FragColorbgra8[x*4+1] = d[1];
4004                         buffer_FragColorbgra8[x*4+2] = d[2];
4005                         buffer_FragColorbgra8[x*4+3] = d[3];
4006                 }
4007         }
4008         else
4009         {
4010                 for (x = startx;x < endx;x++)
4011                 {
4012                         z = buffer_z[x];
4013                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4014                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4015                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4016                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4017                         if (attenuation < 0.01f)
4018                                 continue;
4019                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4020                         {
4021                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4022                                 if (attenuation < 0.01f)
4023                                         continue;
4024                         }
4025
4026                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4027                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4028                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4029                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4030                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4031                         {
4032                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4033                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4034                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4035                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4036                         }
4037                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4038                         {
4039                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4040                                 attenuation *= (1.0f / 255.0f);
4041                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4042                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4043                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4044                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4045                         }
4046                         else
4047                         {
4048                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4049                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4050                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4051                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4052                         }
4053                         buffer_FragColorbgra8[x*4+0] = d[0];
4054                         buffer_FragColorbgra8[x*4+1] = d[1];
4055                         buffer_FragColorbgra8[x*4+2] = d[2];
4056                         buffer_FragColorbgra8[x*4+3] = d[3];
4057                 }
4058         }
4059         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4060 #endif
4061 }
4062
4063
4064
4065 void DPSOFTRAST_VertexShader_Refraction(void)
4066 {
4067         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4068 }
4069
4070 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4071 {
4072         // TODO: IMPLEMENT
4073         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4074         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4075         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4076         memset(buffer_FragColorbgra8, 0, span->length*4);
4077         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4078 }
4079
4080
4081
4082 void DPSOFTRAST_VertexShader_Water(void)
4083 {
4084         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4085 }
4086
4087
4088 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4089 {
4090         // TODO: IMPLEMENT
4091         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4092         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4093         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4094         memset(buffer_FragColorbgra8, 0, span->length*4);
4095         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4096 }
4097
4098
4099
4100 void DPSOFTRAST_VertexShader_ShowDepth(void)
4101 {
4102         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4103 }
4104
4105 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4106 {
4107         // TODO: IMPLEMENT
4108         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4109         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4110         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4111         memset(buffer_FragColorbgra8, 0, span->length*4);
4112         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4113 }
4114
4115
4116
4117 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4118 {
4119         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4120 }
4121
4122 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4123 {
4124         // TODO: IMPLEMENT
4125         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4126         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4127         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4128         memset(buffer_FragColorbgra8, 0, span->length*4);
4129         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4130 }
4131
4132
4133
4134 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4135 {
4136         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4137 }
4138
4139 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4140 {
4141         // TODO: IMPLEMENT
4142         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4143         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4144         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4145         memset(buffer_FragColorbgra8, 0, span->length*4);
4146         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4147 }
4148
4149
4150
4151 typedef struct DPSOFTRAST_ShaderModeInfo_s
4152 {
4153         int lodarrayindex;
4154         void (*Vertex)(void);
4155         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4156         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4157         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4158 }
4159 DPSOFTRAST_ShaderModeInfo;
4160
4161 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4162 {
4163         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4164         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4165         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4166         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4167         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4168         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4169         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
4170         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4171         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4172         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4173         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4174         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4175         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4176         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4177         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4178         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}}
4179 };
4180
4181 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4182 {
4183         int i;
4184         int x;
4185         int startx;
4186         int endx;
4187 //      unsigned int c;
4188 //      unsigned int *colorpixel;
4189         unsigned int *depthpixel;
4190         float w;
4191         float wslope;
4192         int depth;
4193         int depthslope;
4194         unsigned int d;
4195         DPSOFTRAST_State_Triangle *triangle;
4196         DPSOFTRAST_State_Span *span;
4197         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4198         for (i = 0; i < thread->numspans; i++)
4199         {
4200                 span = &thread->spans[i];
4201                 triangle = &thread->triangles[span->triangle];
4202                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4203                 {
4204                         wslope = triangle->w[0];
4205                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4206                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4207                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4208                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4209                         switch(thread->fb_depthfunc)
4210                         {
4211                         default:
4212                         case GL_ALWAYS:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
4213                         case GL_LESS:    for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4214                         case GL_LEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4215                         case GL_EQUAL:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4216                         case GL_GEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4217                         case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4218                         case GL_NEVER:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
4219                         }
4220                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4221                         //for (x = 0;x < span->length;x++)
4222                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4223                         // if there is no color buffer, skip pixel shader
4224                         startx = 0;
4225                         endx = span->length;
4226                         while (startx < endx && !pixelmask[startx])
4227                                 startx++;
4228                         while (endx > startx && !pixelmask[endx-1])
4229                                 endx--;
4230                         if (startx >= endx)
4231                                 continue; // no pixels to fill
4232                         span->pixelmask = pixelmask;
4233                         span->startx = startx;
4234                         span->endx = endx;
4235                         // run pixel shader if appropriate
4236                         // do this before running depthmask code, to allow the pixelshader
4237                         // to clear pixelmask values for alpha testing
4238                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4239                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4240                         if (thread->depthmask)
4241                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4242                                         if (pixelmask[x])
4243                                                 depthpixel[x] = d;
4244                 }
4245                 else
4246                 {
4247                         // no depth testing means we're just dealing with color...
4248                         // if there is no color buffer, skip pixel shader
4249                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4250                         {
4251                                 memset(pixelmask, 1, span->length);
4252                                 span->pixelmask = pixelmask;
4253                                 span->startx = 0;
4254                                 span->endx = span->length;
4255                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4256                         }
4257                 }
4258         }
4259         thread->numspans = 0;
4260 }
4261
// Field list of the Draw command (declared with number 22), the record that
// DPSOFTRAST_Interpret_Draw consumes as DPSOFTRAST_Command_Draw.
//   starty/endy   - scanline range of the draw, used to skip threads it cannot touch
//   refcount      - set to the thread count when the draw is queued; each thread
//                   decrements it once when it is done with the command
//   clipped       - nonzero when triangles must be tested against the near plane
//   arrays        - vertex data: screen coordinates, then positions, then attributes
//   element3i/3s  - optional private copy of the index list (at most one is non-NULL)
// NOTE(review): DEFCOMMAND is defined outside this view. Code below also reads
// command->commandsize, which is not in this list, so the macro presumably adds
// a common command header - confirm. datasize is not written anywhere in the
// visible code.
4262 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4263
// Per-thread interpreter for a queued Draw command.
//
// For every triangle of the command this routine resolves the three vertex
// indices, optionally clips the triangle against the near plane, culls it by
// the sign of a screen-space edge cross product according to thread->cullface,
// clamps its screen bounds to this thread's band of scanlines [miny, maxy),
// fills in a DPSOFTRAST_State_Triangle (w plane, one plane set per shader
// attribute array, a mip level per texture unit) and appends horizontal spans
// to thread->spans.  The span and triangle buffers are flushed through
// DPSOFTRAST_Draw_ProcessSpans whenever they fill up and once more at the end.
//
// Each thread drops one reference on command->refcount when it is finished
// with the command; the thread that brings the count to zero frees
// command->arrays if commandsize shows the command carries no inline data.
//
// The body is compiled only when SSE2_PRESENT is defined; otherwise the
// function does nothing.
4264 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4265 {
4266 #ifdef SSE2_PRESENT
4267         int cullface = thread->cullface;
4268         int width = dpsoftrast.fb_width;
             // this thread owns the scanlines miny <= y < maxy
4269         int miny = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4270         int maxy = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4271         __m128i fbmin, fbmax;
4272         __m128 viewportcenter, viewportscale;
4273         int firstvertex = command->firstvertex;
4274         int numvertices = command->numvertices;
4275         int numtriangles = command->numtriangles;
4276         const int *element3i = command->element3i;
4277         const unsigned short *element3s = command->element3s;
4278         int clipped = command->clipped;
4279         int i;
4280         int j;
4281         int k;
4282         int y;
4283         int e[3];
4284         __m128i screeny;
4285         int starty, endy;
4286         int numpoints;
4287         int clipcase;
4288         float clipdist[4];
4289         __m128 triangleedge1, triangleedge2, trianglenormal;
4290         __m128 clipfrac[3];
4291         __m128 screen[4];
4292         DPSOFTRAST_State_Triangle *triangle;
4293         DPSOFTRAST_Texture *texture;
             // the draw's scanline range does not overlap this thread's band: just
             // release our reference, freeing the vertex data if we were the last user
             // (a commandsize no larger than the bare command is the case where
             // DPSOFTRAST_Draw_AllocateDrawCommand allocated the data with MM_CALLOC)
4294         if (command->starty >= maxy || command->endy <= miny)
4295         {
4296                 if (!ATOMIC_DECREMENT(command->refcount))
4297                 {
4298                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4299                                 MM_FREE(command->arrays);
4300                 }
4301                 return;
4302         }
4303         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // clamp limits for packed 16-bit (x, y) pairs: x in [0, width-1], y in [miny, maxy-1]
4304         fbmin = _mm_setr_epi16(0, miny, 0, miny, 0, miny, 0, miny);
4305         fbmax = _mm_sub_epi16(_mm_setr_epi16(width, maxy, width, maxy, width, maxy, width, maxy), _mm_set1_epi16(1));
4306         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4307         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4308         screen[3] = _mm_setzero_ps();
4309         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4310         for (i = 0;i < numtriangles;i++)
4311         {
                     // command->arrays starts with the projected screen coordinates; the
                     // following numvertices*4 floats are the positions read for clipping,
                     // and each attribute array comes numvertices*4 floats after the previous one
4312                 const float *screencoord4f = command->arrays;
4313                 const float *arrays = screencoord4f + numvertices*4;
4314
4315                 // generate the 3 edges of this triangle
4316                 // generate spans for the triangle - switch based on left split or right split classification of triangle
                     // element values are rebased by firstvertex so they index this command's
                     // arrays; without an index list triangle i uses vertices 3i, 3i+1, 3i+2
4317                 if (element3s)
4318                 {
4319                         e[0] = element3s[i*3+0] - firstvertex;
4320                         e[1] = element3s[i*3+1] - firstvertex;
4321                         e[2] = element3s[i*3+2] - firstvertex;
4322                 }
4323                 else if (element3i)
4324                 {
4325                         e[0] = element3i[i*3+0] - firstvertex;
4326                         e[1] = element3i[i*3+1] - firstvertex;
4327                         e[2] = element3i[i*3+2] - firstvertex;
4328                 }
4329                 else
4330                 {
4331                         e[0] = i*3+0;
4332                         e[1] = i*3+1;
4333                         e[2] = i*3+2;
4334                 }
4335
                     // SKIPBACKFACE: builds the two edge vectors that share screen[1] and the
                     // scalar x/y cross product term of them, then `continue`s to the next
                     // triangle when its sign matches the face selected by cullface
4336 #define SKIPBACKFACE \
4337                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4338                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4339                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4340                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4341                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4342                 switch(cullface) \
4343                 { \
4344                 case GL_BACK: \
4345                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4346                                 continue; \
4347                         break; \
4348                 case GL_FRONT: \
4349                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4350                                 continue; \
4351                         break; \
4352                 }
4353
                     // CLIPPEDVERTEXLERP: stores the crossing fraction of edge p1->p2 in
                     // clipfrac[p1], interpolates the position at that fraction and projects
                     // the result into screen[k]
                     // CLIPPEDVERTEXCOPY: takes the already projected coordinate of vertex p1
4354 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4355                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4356                         { \
4357                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4358                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4359                         }
4360 #define CLIPPEDVERTEXCOPY(k,p1) \
4361                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4362
                     // GENATTRIBS: fetches the three corner values of the attribute array that
                     // `arrays` currently points at, repeating for each clipcase the copy/lerp
                     // choices made for screen[0..2] in the matching clip branch
4363 #define GENATTRIBCOPY(attrib, p1) \
4364                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4365 #define GENATTRIBLERP(attrib, p1, p2) \
4366                 { \
4367                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4368                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4369                 }
4370 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4371                 switch(clipcase) \
4372                 { \
4373                 default: \
4374                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4375                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4376                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4377                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4378                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4379                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4380                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4381                 }
4382
                     // draws not flagged as clipped skip the distance tests and jump straight
                     // into the "all three vertices in front" branch below
4383                 if (! clipped)
4384                         goto notclipped;
4385
4386                 // calculate distance from nearplane
4387                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4388                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4389                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
                     // one branch per combination of vertices with clipdist >= 0; each fills
                     // screen[] with 3 or 4 points and records the combination in clipcase
4390                 if (clipdist[0] >= 0.0f)
4391                 {
4392                         if (clipdist[1] >= 0.0f)
4393                         {
4394                                 if (clipdist[2] >= 0.0f)
4395                                 {
4396                                 notclipped:
4397                                         // triangle is entirely in front of nearplane
4398                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4399                                         SKIPBACKFACE;
4400                                         numpoints = 3;
4401                                         clipcase = 0;
4402                                 }
4403                                 else
4404                                 {
4405                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4406                                         SKIPBACKFACE;
4407                                         numpoints = 4;
4408                                         clipcase = 1;
4409                                 }
4410                         }
4411                         else
4412                         {
4413                                 if (clipdist[2] >= 0.0f)
4414                                 {
4415                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4416                                         SKIPBACKFACE;
4417                                         numpoints = 4;
4418                                         clipcase = 2;
4419                                 }
4420                                 else
4421                                 {
4422                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4423                                         SKIPBACKFACE;
4424                                         numpoints = 3;
4425                                         clipcase = 3;
4426                                 }
4427                         }
4428                 }
4429                 else if (clipdist[1] >= 0.0f)
4430                 {
4431                         if (clipdist[2] >= 0.0f)
4432                         {
4433                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4434                                 SKIPBACKFACE;
4435                                 numpoints = 4;
4436                                 clipcase = 4;
4437                         }
4438                         else
4439                         {
4440                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4441                                 SKIPBACKFACE;
4442                                 numpoints = 3;
4443                                 clipcase = 5;
4444                         }
4445                 }
4446                 else if (clipdist[2] >= 0.0f)
4447                 {
4448                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4449                         SKIPBACKFACE;
4450                         numpoints = 3;
4451                         clipcase = 6;
4452                 }
4453                 else continue; // triangle is entirely behind nearplane
4454
4455                 {
4456                         // calculate integer y coords for triangle points
                             // (x, y) of each point is truncated and packed to 16 bits; for a
                             // 3 point polygon the third point is repeated in the fourth slot
4457                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4458                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4459                                         screenmin = _mm_min_epi16(screeni, screenir),
4460                                         screenmax = _mm_max_epi16(screeni, screenir);
4461                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4462                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4463                         screenmin = _mm_max_epi16(screenmin, fbmin);
4464                         screenmax = _mm_min_epi16(screenmax, fbmax);
4465                         // skip offscreen triangles
4466                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4467                                 continue;
                             // scanline range of the triangle inside this thread's band
4468                         starty = _mm_extract_epi16(screenmin, 1);
4469                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // keep only the integer y of each point, one per 32-bit lane
4470                         screeny = _mm_srai_epi32(screeni, 16);
4471                 }
4472
4473                 triangle = &thread->triangles[thread->numtriangles];
4474
4475                 // calculate attribute plans for triangle data...
4476                 // okay, this triangle is going to produce spans, we'd better project
4477                 // the interpolants now (this is what gives perspective texturing),
4478                 // this consists of simply multiplying all arrays by the W coord
4479                 // (which is basically 1/Z), which will be undone per-pixel
4480                 // (multiplying by Z again) to get the perspective-correct array
4481                 // values
4482                 {
4483                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4484                         __m128 mipedgescale, mipdensity;
4485                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4486                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4487                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4488                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4489                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4490                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4491                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4492                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                             // plane for w itself: x slope, y slope and value at the screen origin,
                             // stored in triangle->w[0], w[1], w[2]
4493                         attribedge1 = _mm_sub_ss(w0, w1);
4494                         attribedge2 = _mm_sub_ss(w2, w1);
4495                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4496                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4497                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4498                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4499                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4500                         _mm_store_ss(&triangle->w[0], attribxslope);
4501                         _mm_store_ss(&triangle->w[1], attribyslope);
4502                         _mm_store_ss(&triangle->w[2], attriborigin);
4503                         mipedgescale = _mm_setzero_ps();
                             // same plane setup for every attribute array the shader mode lists;
                             // the list ends at the first entry >= DPSOFTRAST_ARRAY_TOTAL
4504                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4505                         {
4506                                 __m128 attrib0, attrib1, attrib2;
4507                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4508                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4509                                         break;
                                     // step to the next attribute array in the command's vertex data
4510                                 arrays += numvertices*4;
4511                                 GENATTRIBS(attrib0, attrib1, attrib2);
4512                                 attriborigin = _mm_mul_ps(attrib1, w1);
4513                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4514                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4515                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4516                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4517                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4518                                 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4519                                 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4520                                 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
                                     // the array named by lodarrayindex supplies the per-edge change
                                     // of the attribute, divided by the screen-space edge length
                                     // (approximate reciprocal square root), used for mip selection below
4521                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4522                                 {
4523                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4524                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4525                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4526                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4527                                 }
4528                         }
4529
                             // mip level per texture unit used by the shader mode: stays 0 unless
                             // the bound texture's filter is above DPSOFTRAST_TEXTURE_FILTER_LINEAR
4530                         memset(triangle->mip, 0, sizeof(triangle->mip));
4531                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4532                         {
4533                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4534                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4535                                         break;
4536                                 texture = thread->texbound[texunit];
4537                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4538                                 {
4539                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4540                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4541                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4542                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4543                                         // this will be multiplied in the texturing routine by the texture resolution
4544                                         y = _mm_cvtss_si32(mipdensity);
4545                                         if (y > 0)
4546                                         {
                                                     // half the base-2 logarithm of the squared density,
                                                     // capped at the last mip level of the texture
4547                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4548                                                 if (y > texture->mipmaps - 1)
4549                                                         y = texture->mipmaps - 1;
4550                                                 triangle->mip[texunit] = y;
4551                                         }
4552                                 }
4553                         }
4554                 }
4555
                     // walk the scanlines of the polygon; yccmask holds four bits per point
                     // that are set when y is greater than that point's integer y, and the
                     // tables below turn it into the two edges (edgeNp -> edgeNn) that bound
                     // the current group of scanlines
4556                 for (y = starty; y < endy;)
4557                 {
4558                         __m128 xcoords, xslope;
4559                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4560                         int yccmask = _mm_movemask_epi8(ycc);
4561                         int edge0p, edge0n, edge1p, edge1n;
4562                         int nexty;
4563                         if (numpoints == 4)
4564                         {
4565                                 switch(yccmask)
4566                                 {
4567                                 default:
4568                                 case 0xFFFF: /*0000*/ y = endy; continue;
4569                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4570                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4571                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4572                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4573                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4574                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4575                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4576                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4577                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4578                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4579                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4580                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4581                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4582                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4583                                 case 0x0000: /*1111*/ y++; continue;
4584                                 }
4585                         }
4586                         else
4587                         {
4588                                 switch(yccmask)
4589                                 {
4590                                 default:
4591                                 case 0xFFFF: /*000*/ y = endy; continue;
4592                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4593                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4594                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4595                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4596                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4597                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4598                                 case 0x0000: /*111*/ y++; continue;
4599                                 }
4600                         }
                             // NOTE(review): nexty looks like the smallest integer y among the
                             // points not yet passed (passed lanes are replaced by 0x7FFF before
                             // the min), i.e. the last scanline using this edge pair - confirm
4601                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4602                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4603                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4604                         nexty = _mm_extract_epi16(ycc, 0);
4605                         if(nexty >= endy) nexty = endy-1;
                             // swap the edge pair when edge 0's larger x exceeds edge 1's smaller x,
                             // so that edge 0 supplies the span start and edge 1 the span end
4606                         if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
4607                         {
4608                                 int tmp = edge0n;
4609                                 edge0n = edge1n;
4610                                 edge1n = tmp;
4611                                 tmp = edge0p;
4612                                 edge0p = edge1p;
4613                                 edge1p = tmp;
4614                         }
                             // per-scanline x step of both edges (dx/dy), then the x of both
                             // edges at scanline y, plus 0.5 before conversion to integer
4615                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4616                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4617                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4618                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4619                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4620                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4621                         {
4622                                 int startx, endx, offset;
4623                                 startx = _mm_cvtss_si32(xcoords);
4624                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4625                                 if (startx < 0) startx = 0;
4626                                 if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
4627                                 if (startx >= endx) continue;
                                     // cut the run into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH
                                     // pixels; shade the batch as soon as the span buffer is full
4628                                 for (offset = startx; offset < endx;)
4629                                 {
4630                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4631                                         span->triangle = thread->numtriangles;
4632                                         span->x = offset;
4633                                         span->y = y;
4634                                         span->length = endx - offset;
4635                                         if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
4636                                                 span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
4637                                         offset += span->length;
4638                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4639                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
4640                                 }
4641                         }
4642                 }
4643
                     // the triangle slot is now in use; when the triangle buffer is full the
                     // pending spans are shaded so the slots can be reused from 0
4644                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4645                 {
4646                         DPSOFTRAST_Draw_ProcessSpans(thread);
4647                         thread->numtriangles = 0;
4648                 }
4649         }
4650
             // done reading the vertex data: release our reference, and free the data
             // if this was the last reference and the command carries no inline data
4651         if (!ATOMIC_DECREMENT(command->refcount))
4652         {
4653                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4654                         MM_FREE(command->arrays);
4655         }
4656
             // shade whatever is still queued for this thread
4657         if (thread->numspans > 0 || thread->numtriangles > 0)
4658         {
4659                 DPSOFTRAST_Draw_ProcessSpans(thread);
4660                 thread->numtriangles = 0;
4661         }
4662 #endif
4663 }
4664
4665 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4666 {
4667         int i;
4668         int j;
4669         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
4670         int datasize = 2*numvertices*sizeof(float[4]);
4671         DPSOFTRAST_Command_Draw *command;
4672         unsigned char *data;
4673         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4674         {
4675                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4676                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4677                         break;
4678                 datasize += numvertices*sizeof(float[4]);
4679         }
4680         if (element3s)
4681                 datasize += numtriangles*sizeof(unsigned short[3]);
4682         else if (element3i)
4683                 datasize += numtriangles*sizeof(int[3]);
4684         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
4685         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4686         {
4687                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4688                 data = (unsigned char *)MM_CALLOC(datasize, 1);
4689         }
4690         else
4691         {
4692                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4693                 data = (unsigned char *)command + commandsize;
4694         }
4695         command->firstvertex = firstvertex;
4696         command->numvertices = numvertices;
4697         command->numtriangles = numtriangles;
4698         command->arrays = (float *)data;
4699         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4700         dpsoftrast.firstvertex = firstvertex;
4701         dpsoftrast.numvertices = numvertices;
4702         dpsoftrast.screencoord4f = (float *)data;
4703         data += numvertices*sizeof(float[4]);
4704         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4705         data += numvertices*sizeof(float[4]);
4706         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4707         {
4708                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4709                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4710                         break;
4711                 dpsoftrast.post_array4f[j] = (float *)data;
4712                 data += numvertices*sizeof(float[4]);
4713         }
4714         command->element3i = NULL;
4715         command->element3s = NULL;
4716         if (element3s)
4717         {
4718                 command->element3s = (unsigned short *)data;
4719                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4720         }
4721         else if (element3i)
4722         {
4723                 command->element3i = (int *)data;
4724                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
4725         }
4726         return command;
4727 }
4728
4729 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4730 {
4731         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4732         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4733         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4734         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
4735         if (command->starty >= command->endy)
4736         {
4737                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4738                         MM_FREE(command->arrays);
4739                 DPSOFTRAST_UndoCommand(command->commandsize);
4740                 return;
4741         }
4742         command->clipped = dpsoftrast.drawclipped;
4743         command->refcount = dpsoftrast.numthreads;
4744
4745 #ifdef USE_THREADS
4746         DPSOFTRAST_Draw_SyncCommands();
4747         {
4748                 int i;
4749                 int nexty = 0;
4750                 for (i = 0; i < dpsoftrast.numthreads; i++)
4751                 {
4752                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4753                         int y = nexty;
4754                         nexty = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4755                         if (command->starty < nexty && command->endy > y && thread->starving)
4756                                 SDL_CondSignal(thread->drawcond);
4757                 }
4758         }
4759 #else
4760         DPSOFTRAST_Draw_FlushThreads();
4761 #endif
4762 }
4763  
4764 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4765 {
4766         int commandoffset = thread->commandoffset;
4767         while (commandoffset != endoffset)
4768         {
4769                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4770                 switch (command->opcode)
4771                 {
4772 #define INTERPCOMMAND(name) \
4773                 case DPSOFTRAST_OPCODE_##name : \
4774                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4775                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4776                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4777                                 commandoffset = 0; \
4778                         break;
4779                 INTERPCOMMAND(Viewport)
4780                 INTERPCOMMAND(ClearColor)
4781                 INTERPCOMMAND(ClearDepth)
4782                 INTERPCOMMAND(ColorMask)
4783                 INTERPCOMMAND(DepthTest)
4784                 INTERPCOMMAND(ScissorTest)
4785                 INTERPCOMMAND(Scissor)
4786                 INTERPCOMMAND(BlendFunc)
4787                 INTERPCOMMAND(BlendSubtract)
4788                 INTERPCOMMAND(DepthMask)
4789                 INTERPCOMMAND(DepthFunc)
4790                 INTERPCOMMAND(DepthRange)
4791                 INTERPCOMMAND(PolygonOffset)
4792                 INTERPCOMMAND(CullFace)
4793                 INTERPCOMMAND(AlphaTest)
4794                 INTERPCOMMAND(AlphaFunc)
4795                 INTERPCOMMAND(SetTexture)
4796                 INTERPCOMMAND(SetShader)
4797                 INTERPCOMMAND(Uniform4f)
4798                 INTERPCOMMAND(UniformMatrix4f)
4799                 INTERPCOMMAND(Uniform1i)
4800
4801                 case DPSOFTRAST_OPCODE_Draw:
4802                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4803                         commandoffset += command->commandsize;
4804                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4805                                 commandoffset = 0;
4806                         thread->commandoffset = commandoffset;
4807                         break;
4808
4809                 case DPSOFTRAST_OPCODE_Reset:
4810                         commandoffset = 0;
4811                         break;
4812                 }
4813         }
4814         thread->commandoffset = commandoffset;
4815 }
4816
4817 #ifdef USE_THREADS
4818 static int DPSOFTRAST_Draw_Thread(void *data)
4819 {
4820         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
4821         while(thread->index >= 0)
4822         {
4823                 if (thread->commandoffset != dpsoftrast.drawcommand)
4824                 {
4825                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
4826                 }
4827                 else 
4828                 {
4829                         SDL_LockMutex(thread->drawmutex);
4830                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
4831                         {
4832                                 if (thread->waiting) SDL_CondSignal(thread->waitcond);
4833                                 thread->starving = true;
4834                                 SDL_CondWait(thread->drawcond, thread->drawmutex);
4835                                 thread->starving = false;
4836                         }
4837                         SDL_UnlockMutex(thread->drawmutex);
4838                 }
4839         }   
4840         return 0;
4841 }
4842 #endif
4843
4844 static void DPSOFTRAST_Draw_FlushThreads(void)
4845 {
4846         DPSOFTRAST_State_Thread *thread;
4847         int i;
4848         DPSOFTRAST_Draw_SyncCommands();
4849 #ifdef USE_THREADS
4850         for (i = 0; i < dpsoftrast.numthreads; i++)
4851         {
4852                 thread = &dpsoftrast.threads[i];
4853                 if (thread->commandoffset != dpsoftrast.drawcommand)
4854                 {
4855                         SDL_LockMutex(thread->drawmutex);
4856                         if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4857                                 SDL_CondSignal(thread->drawcond);
4858                         SDL_UnlockMutex(thread->drawmutex);
4859                 }
4860         }
4861 #endif                  
4862         for (i = 0; i < dpsoftrast.numthreads; i++)
4863         {
4864                 thread = &dpsoftrast.threads[i];
4865 #ifdef USE_THREADS
4866                 if (thread->commandoffset != dpsoftrast.drawcommand)
4867                 {
4868                         SDL_LockMutex(thread->drawmutex);
4869                         if (thread->commandoffset != dpsoftrast.drawcommand)
4870                         {
4871                                 thread->waiting = true;
4872                                 SDL_CondWait(thread->waitcond, thread->drawmutex);
4873                                 thread->waiting = false;
4874                         }
4875                         SDL_UnlockMutex(thread->drawmutex);
4876                 }
4877 #else
4878                 if (thread->commandoffset != dpsoftrast.drawcommand)
4879                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4880 #endif
4881         }
4882         dpsoftrast.commandpool.usedcommands = 0;
4883 }
4884
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads, which
// completes all queued commands before returning.
void DPSOFTRAST_Flush(void)
{
        DPSOFTRAST_Draw_FlushThreads();
}
4889
// Public finish entry point: currently identical to DPSOFTRAST_Flush.
void DPSOFTRAST_Finish(void)
{
        DPSOFTRAST_Flush();
}
4894
4895 void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorpixels, unsigned int *depthpixels)
4896 {
4897         int i;
4898         union
4899         {
4900                 int i;
4901                 unsigned char b[4];
4902         }
4903         u;
4904         u.i = 1;
4905         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4906         dpsoftrast.bigendian = u.b[3];
4907         dpsoftrast.fb_width = width;
4908         dpsoftrast.fb_height = height;
4909         dpsoftrast.fb_depthpixels = depthpixels;
4910         dpsoftrast.fb_colorpixels[0] = colorpixels;
4911         dpsoftrast.fb_colorpixels[1] = NULL;
4912         dpsoftrast.fb_colorpixels[1] = NULL;
4913         dpsoftrast.fb_colorpixels[1] = NULL;
4914         dpsoftrast.viewport[0] = 0;
4915         dpsoftrast.viewport[1] = 0;
4916         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4917         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4918         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
4919         dpsoftrast.texture_firstfree = 1;
4920         dpsoftrast.texture_end = 1;
4921         dpsoftrast.texture_max = 0;
4922         dpsoftrast.color[0] = 1;
4923         dpsoftrast.color[1] = 1;
4924         dpsoftrast.color[2] = 1;
4925         dpsoftrast.color[3] = 1;
4926 #ifdef USE_THREADS
4927         dpsoftrast.numthreads = bound(1, numthreads, 64);
4928 #else
4929         dpsoftrast.numthreads = 1;
4930 #endif
4931         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4932         for (i = 0; i < dpsoftrast.numthreads; i++)
4933         {
4934                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4935                 thread->index = i;
4936                 thread->cullface = GL_BACK;
4937                 thread->colormask[1] = 1;
4938                 thread->colormask[2] = 1;
4939                 thread->colormask[3] = 1;
4940                 thread->blendfunc[0] = GL_ONE;
4941                 thread->blendfunc[1] = GL_ZERO;
4942                 thread->depthmask = true;
4943                 thread->depthtest = true;
4944                 thread->depthfunc = GL_LEQUAL;
4945                 thread->scissortest = false;
4946                 thread->alphatest = false;
4947                 thread->alphafunc = GL_GREATER;
4948                 thread->alphavalue = 0.5f;
4949                 thread->viewport[0] = 0;
4950                 thread->viewport[1] = 0;
4951                 thread->viewport[2] = dpsoftrast.fb_width;
4952                 thread->viewport[3] = dpsoftrast.fb_height;
4953                 thread->scissor[0] = 0;
4954                 thread->scissor[1] = 0;
4955                 thread->scissor[2] = dpsoftrast.fb_width;
4956                 thread->scissor[3] = dpsoftrast.fb_height;
4957                 thread->depthrange[0] = 0;
4958                 thread->depthrange[1] = 1;
4959                 thread->polygonoffset[0] = 0;
4960                 thread->polygonoffset[1] = 0;
4961
4962                 thread->numspans = 0;
4963                 thread->numtriangles = 0;
4964                 thread->commandoffset = 0;
4965                 thread->waiting = false;
4966                 thread->starving = false;
4967 #ifdef USE_THREADS
4968                 thread->waitcond = SDL_CreateCond();
4969                 thread->drawcond = SDL_CreateCond();
4970                 thread->drawmutex = SDL_CreateMutex();
4971 #endif
4972
4973                 thread->validate = -1;
4974                 DPSOFTRAST_Validate(thread, -1);
4975 #ifdef USE_THREADS
4976                 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
4977 #endif
4978         }
4979 }
4980
4981 void DPSOFTRAST_Shutdown(void)
4982 {
4983         int i;
4984 #ifdef USE_THREADS
4985         if(dpsoftrast.numthreads > 0)
4986         {
4987                 DPSOFTRAST_State_Thread *thread;
4988                 for (i = 0; i < dpsoftrast.numthreads; i++)
4989                 {
4990                         thread = &dpsoftrast.threads[i];
4991                         SDL_LockMutex(thread->drawmutex);
4992                         thread->index = -1;
4993                         SDL_CondSignal(thread->drawcond);
4994                         SDL_UnlockMutex(thread->drawmutex);
4995                         SDL_WaitThread(thread->thread, NULL);
4996                         SDL_DestroyCond(thread->waitcond);
4997                         SDL_DestroyCond(thread->drawcond);
4998                         SDL_DestroyMutex(thread->drawmutex);
4999                 }
5000         }
5001 #endif
5002         for (i = 0;i < dpsoftrast.texture_end;i++)
5003                 if (dpsoftrast.texture[i].bytes)
5004                         MM_FREE(dpsoftrast.texture[i].bytes);
5005         if (dpsoftrast.texture)
5006                 free(dpsoftrast.texture);
5007         if (dpsoftrast.threads)
5008                 MM_FREE(dpsoftrast.threads);
5009         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5010 }
5011