]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
clear scissor fix
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "dpsoftrast.h"
7
// worker threads are only enabled when the build uses SDL
#ifdef USE_SDL
#define USE_THREADS
#endif

#ifdef USE_THREADS
#include <SDL.h>
#include <SDL_thread.h>
#endif

// C builds get 'bool' as an alias of qboolean (qboolean is not declared in this file)
#ifndef __cplusplus
typedef qboolean bool;
#endif
20
// byte alignments matching the ALIGN() and ATOMIC() declaration wrappers below
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 32

#ifdef SSE2_PRESENT
        #if defined(__GNUC__)
                // GCC syntax: 16 byte (ALIGN) and 32 byte (ATOMIC) aligned declarations
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(32)))
                #ifdef USE_THREADS
                        // store fence plus the __sync builtins for the shared counters
                        #define MEMORY_BARRIER (_mm_sfence())
                        //(__sync_synchronize())
                        #define ATOMIC_COUNTER volatile int
                        #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
                        #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
                        #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
                #endif
        #elif defined(_MSC_VER)
                // MSVC syntax: same alignments, Interlocked* functions for the counters
                #define ALIGN(var) __declspec(align(16)) var
                #define ATOMIC(var) __declspec(align(32)) var
                #ifdef USE_THREADS
                        #define MEMORY_BARRIER (_mm_sfence())
                        //(MemoryBarrier())
                        #define ATOMIC_COUNTER volatile LONG
                        #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
                        #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
                        #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
                #endif
        #else
                // any other compiler: build without threads and without SSE2
                #undef USE_THREADS
                #undef SSE2_PRESENT
        #endif
#endif

// without SSE2 the alignment wrappers leave the declaration unchanged
#ifndef SSE2_PRESENT
        #define ALIGN(var) var
        #define ATOMIC(var) var
#endif

// without threads the "atomic" helpers are plain integer operations
#ifndef USE_THREADS
        #define MEMORY_BARRIER ((void)0)
        #define ATOMIC_COUNTER int
        #define ATOMIC_INCREMENT(counter) (++(counter))
        #define ATOMIC_DECREMENT(counter) (--(counter))
        #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
65
// Allocation helpers: MM_MALLOC/MM_CALLOC/MM_FREE must always be used together,
// because the SSE2 build uses _mm_malloc/_mm_free (ATOMIC_SIZE aligned) while
// the fallback build uses the C library allocator.
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// Aligned counterpart of calloc(): returns nmemb*size zeroed bytes aligned to
// ATOMIC_SIZE, or NULL if the allocation fails or the byte count would not
// fit in size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// refuse requests whose byte count would wrap around, like calloc() does
	if (size != 0 && nmemb > (size_t)-1 / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
84
// indices of the per-vertex attribute arrays; DPSOFTRAST_ARRAY_TOTAL is the
// number of arrays and is used to size attribute tables
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION,
        DPSOFTRAST_ARRAY_COLOR,
        DPSOFTRAST_ARRAY_TEXCOORD0,
        DPSOFTRAST_ARRAY_TEXCOORD1,
        DPSOFTRAST_ARRAY_TEXCOORD2,
        DPSOFTRAST_ARRAY_TEXCOORD3,
        DPSOFTRAST_ARRAY_TEXCOORD4,
        DPSOFTRAST_ARRAY_TEXCOORD5,
        DPSOFTRAST_ARRAY_TEXCOORD6,
        DPSOFTRAST_ARRAY_TEXCOORD7,
        DPSOFTRAST_ARRAY_TOTAL
}
DPSOFTRAST_ARRAY;
100
// one slot of the dpsoftrast.texture array
typedef struct DPSOFTRAST_Texture_s
{
        int flags; // flags value passed to DPSOFTRAST_Texture_New (format and DPSOFTRAST_TEXTURE_FLAG_ bits)
        int width; // dimensions of mip level 0
        int height;
        int depth;
        int sides; // 6 for cubemaps, 1 otherwise
        DPSOFTRAST_TEXTURE_FILTER filter;
        int mipmaps; // number of valid entries in mipmap[]
        int size; // total number of bytes in 'bytes' (all mip levels)
        ATOMIC_COUNTER binds; // when nonzero, DPSOFTRAST_Flush is called before the texture is modified or freed
        unsigned char *bytes; // pixel storage for all mip levels; NULL marks an unused slot
        int mipmap[DPSOFTRAST_MAXMIPMAPS][5]; // per level: byte offset into 'bytes', byte size, width, height, depth
}
DPSOFTRAST_Texture;
116
// command structs use the same alignment as ALIGN()
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// common header shared by every command struct (see DEFCOMMAND)
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode;
        unsigned short commandsize;
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// declares DPSOFTRAST_OPCODE_<name> = opcodeval and the aligned struct
// DPSOFTRAST_Command_<name>, which starts with the same opcode/commandsize
// members as DPSOFTRAST_Command followed by 'fields'
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );
137
// size in bytes of the command buffer below
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// byte buffer holding the queued commands
typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
{
        int freecommand; // copied into dpsoftrast.drawcommand by DPSOFTRAST_Draw_SyncCommands
        int usedcommands;
        ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
148
// per-triangle interpolation data used while shading spans
typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
{
        unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        float w[3];
        // per attribute array: [0] = change per x, [1] = change per y, [2] = base value
        // (this is how DPSOFTRAST_CALCATTRIB combines them)
        ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
156
// Evaluate attribute array 'arrayindex' of 'triangle' at the start of 'span':
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// DPSOFTRAST_CALCATTRIB works on __m128 values, DPSOFTRAST_CALCATTRIB4F on
// float[4] arrays. All pointer arguments are parenthesized so expressions
// such as 'spans + i' can be passed safely.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
173                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// one horizontal run of pixels belonging to a single triangle
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle; // triangle this span was generated by
        int x; // framebuffer x coord
        int y; // framebuffer y coord
        int length; // pixel count
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
}
DPSOFTRAST_State_Span);
187
// capacities of the per-thread spans[] and triangles[] arrays
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// bits of DPSOFTRAST_State_Thread::validate; a set bit means the matching
// derived values are recomputed by DPSOFTRAST_Validate
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)

// blend modes stored in fb_blendmode; selected from the blendfunc factor pair
// by DPSOFTRAST_RecalcBlendFunc
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE,
        DPSOFTRAST_BLENDMODE_ALPHA,
        DPSOFTRAST_BLENDMODE_ADDALPHA,
        DPSOFTRAST_BLENDMODE_ADD,
        DPSOFTRAST_BLENDMODE_INVMOD,
        DPSOFTRAST_BLENDMODE_MUL,
        DPSOFTRAST_BLENDMODE_MUL2,
        DPSOFTRAST_BLENDMODE_SUBALPHA,
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
        DPSOFTRAST_BLENDMODE_TOTAL
}
DPSOFTRAST_BLENDMODE;
210
// state of one entry of dpsoftrast.threads: its own copy of the render state,
// the values derived from it, and its span/triangle work buffers
typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
{
#ifdef USE_THREADS
        SDL_Thread *thread;
#endif
        int index;

        // render state
        int cullface;
        int colormask[4];
        int blendfunc[2]; // source and destination factor (see DPSOFTRAST_RecalcBlendFunc)
        int blendsubtract;
        int depthmask;
        int depthtest;
        int depthfunc;
        int scissortest;
        int alphatest;
        int alphafunc;
        float alphavalue;
        int viewport[4]; // x, y, width, height
        int scissor[4]; // x, y, width, height
        float depthrange[2];
        float polygonoffset[2];

        int shader_mode;
        int shader_permutation;

        // bound textures; these point into dpsoftrast.texture and are rebased by DPSOFTRAST_Texture_Grow
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];

        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // DPSOFTRAST_VALIDATE_ flags
        int validate;

        // derived values (DPSOFTRAST_VALIDATE_FB)
        int fb_colormask;
        int fb_clearscissor[4]; // x, y, width, height after the vertical flip and clamping done in DPSOFTRAST_RecalcFB
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
        int fb_depthfunc;

        // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
        int fb_blendmode;

        ATOMIC(volatile int commandoffset);

        volatile bool waiting;
        volatile bool starving;
#ifdef USE_THREADS
        SDL_cond *waitcond;
        SDL_cond *drawcond;
        SDL_mutex *drawmutex;
#endif

        // work buffers and their fill counts
        int numspans;
        int numtriangles;
        DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
        DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
}
DPSOFTRAST_State_Thread);
273
// global rasterizer state (the single instance is 'dpsoftrast' below)
typedef ATOMIC(struct DPSOFTRAST_State_s
{
        // current render targets (set by DPSOFTRAST_SetRenderTargets)
        int fb_width;
        int fb_height;
        unsigned int *fb_depthpixels;
        unsigned int *fb_colorpixels[4];

        int viewport[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        float color[4];
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // vertex array pointers and layout
        const float *pointer_vertex3f;
        const float *pointer_color4f;
        const unsigned char *pointer_color4ub;
        const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int stride_vertex;
        int stride_color;
        int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        // bound textures; these point into 'texture' and are rebased by DPSOFTRAST_Texture_Grow
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];

        int firstvertex;
        int numvertices;
        float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
        float *screencoord4f;
        int drawstarty;
        int drawendy;
        int drawclipped;

        int shader_mode;
        int shader_permutation;

        // texture slot array
        int texture_max; // number of allocated slots in 'texture'
        int texture_end; // one past the highest slot index in use
        int texture_firstfree; // index at which the search for a free slot starts
        DPSOFTRAST_Texture *texture;

        int bigendian;

        // error reporting
        const char *errorstring;

        int numthreads;
        DPSOFTRAST_State_Thread *threads;

        ATOMIC(volatile int drawcommand); // set from commandpool.freecommand in DPSOFTRAST_Draw_SyncCommands

        DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

DPSOFTRAST_State dpsoftrast;
330
// depth values are stored as integers scaled by 2^30
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// packs four float channels (nominally 0..1) into one pixel laid out as
// alpha<<24 | red<<16 | green<<8 | blue, rounding each channel to nearest;
// every argument is parenthesized so expressions can be passed safely
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// converts a float depth d to the integer depth DPSOFTRAST_DEPTHSCALE * (1 - d)
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
336
337 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
338 {
339         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
340         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
341         fb_viewportcenter[3] = 0.5f;
342         fb_viewportcenter[0] = 0.0f;
343         fb_viewportscale[1] = 0.5f * viewport[2];
344         fb_viewportscale[2] = -0.5f * viewport[3];
345         fb_viewportscale[3] = 0.5f;
346         fb_viewportscale[0] = 1.0f;
347 }
348
349 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
350 {
351         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
352         // and viewport projection values
353         int x1, x2;
354         int y1, y2;
355         x1 = thread->scissor[0];
356         x2 = thread->scissor[0] + thread->scissor[2];
357         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
358         y2 = dpsoftrast.fb_height - thread->scissor[1];
359         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
360         if (x1 < 0) x1 = 0;
361         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
362         if (y1 < 0) y1 = 0;
363         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
364         thread->fb_clearscissor[0] = x1;
365         thread->fb_clearscissor[1] = y1;
366         thread->fb_clearscissor[2] = x2 - x1;
367         thread->fb_clearscissor[3] = y2 - y1;
368
369         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
370 }
371
372 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
373 {
374         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
375 }
376
377 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
378 {
379         if (thread->blendsubtract)
380         {
381                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
382                 {
383                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
384                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
385                 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
386                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
387                 }
388         }
389         else
390         {       
391                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
392                 {
393                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
394                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
395                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
396                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
397                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
398                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
399                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
400                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
401                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
402                 BLENDFUNC(GL_SRC_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
403                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
404                 }
405         }
406 }
407
// calls DPSOFTRAST_Validate only when at least one of the flags in f is
// pending on the thread; always evaluates to 0. The thread argument is
// parenthesized so pointer expressions can be passed.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
409
410 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
411 {
412         mask &= thread->validate;
413         if (!mask)
414                 return;
415         if (mask & DPSOFTRAST_VALIDATE_FB)
416         {
417                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
418                 DPSOFTRAST_RecalcFB(thread);
419         }
420         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
421         {
422                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
423                 DPSOFTRAST_RecalcDepthFunc(thread);
424         }
425         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
426         {
427                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
428                 DPSOFTRAST_RecalcBlendFunc(thread);
429         }
430 }
431
432 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
433 {
434         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
435                 return &dpsoftrast.texture[index];
436         return NULL;
437 }
438
439 static void DPSOFTRAST_Texture_Grow(void)
440 {
441         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
442         DPSOFTRAST_State_Thread *thread;
443         int i;
444         int j;
445         DPSOFTRAST_Flush();
446         // expand texture array as needed
447         if (dpsoftrast.texture_max < 1024)
448                 dpsoftrast.texture_max = 1024;
449         else
450                 dpsoftrast.texture_max *= 2;
451         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
452         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
453                 if(dpsoftrast.texbound[i])
454                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
455         for (j = 0; j < dpsoftrast.numthreads; j++)
456         {
457                 thread = &dpsoftrast.threads[j];
458                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
459                         if(thread->texbound[i])
460                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
461         }
462 }
463
464 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
465 {
466         int w;
467         int h;
468         int d;
469         int size;
470         int s;
471         int texnum;
472         int mipmaps;
473         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
474         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
475         DPSOFTRAST_Texture *texture;
476         if (width*height*depth < 1)
477         {
478                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
479                 return 0;
480         }
481         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
482         {
483                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
484                 return 0;
485         }
486         switch(texformat)
487         {
488         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
489         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
490         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
491                 break;
492         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
493                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
494                 {
495                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
496                         return 0;
497                 }
498                 if (depth != 1)
499                 {
500                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
501                         return 0;
502                 }
503                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
504                 {
505                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
506                         return 0;
507                 }
508                 break;
509         }
510         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
511         {
512                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
513                 return 0;
514         }
515         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
516         {
517                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
518                 return 0;
519         }
520         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
521         {
522                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
523                 return 0;
524         }
525         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
526         {
527                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
528                 return 0;
529         }
530         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
531         {
532                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
533                 return 0;
534         }
535         // find first empty slot in texture array
536         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
537                 if (!dpsoftrast.texture[texnum].bytes)
538                         break;
539         dpsoftrast.texture_firstfree = texnum + 1;
540         if (dpsoftrast.texture_max <= texnum)
541                 DPSOFTRAST_Texture_Grow();
542         if (dpsoftrast.texture_end <= texnum)
543                 dpsoftrast.texture_end = texnum + 1;
544         texture = &dpsoftrast.texture[texnum];
545         memset(texture, 0, sizeof(*texture));
546         texture->flags = flags;
547         texture->width = width;
548         texture->height = height;
549         texture->depth = depth;
550         texture->sides = sides;
551         texture->binds = 0;
552         w = width;
553         h = height;
554         d = depth;
555         size = 0;
556         mipmaps = 0;
557         w = width;
558         h = height;
559         d = depth;
560         for (;;)
561         {
562                 s = w * h * d * sides * 4;
563                 texture->mipmap[mipmaps][0] = size;
564                 texture->mipmap[mipmaps][1] = s;
565                 texture->mipmap[mipmaps][2] = w;
566                 texture->mipmap[mipmaps][3] = h;
567                 texture->mipmap[mipmaps][4] = d;
568                 size += s;
569                 mipmaps++;
570                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
571                         break;
572                 if (w > 1) w >>= 1;
573                 if (h > 1) h >>= 1;
574                 if (d > 1) d >>= 1;
575         }
576         texture->mipmaps = mipmaps;
577         texture->size = size;
578
579         // allocate the pixels now
580         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
581
582         return texnum;
583 }
584 void DPSOFTRAST_Texture_Free(int index)
585 {
586         DPSOFTRAST_Texture *texture;
587         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
588         if (texture->binds)
589                 DPSOFTRAST_Flush();
590         if (texture->bytes)
591                 MM_FREE(texture->bytes);
592         texture->bytes = NULL;
593         memset(texture, 0, sizeof(*texture));
594         // adjust the free range and used range
595         if (dpsoftrast.texture_firstfree > index)
596                 dpsoftrast.texture_firstfree = index;
597         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
598                 dpsoftrast.texture_end--;
599 }
600 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
601 {
602         int i, x, y, z, w, layer0, layer1, row0, row1;
603         unsigned char *o, *i0, *i1, *i2, *i3;
604         DPSOFTRAST_Texture *texture;
605         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
606         if (texture->mipmaps <= 1)
607                 return;
608         for (i = 1;i < texture->mipmaps;i++)
609         {
610                 for (z = 0;z < texture->mipmap[i][4];z++)
611                 {
612                         layer0 = z*2;
613                         layer1 = z*2+1;
614                         if (layer1 >= texture->mipmap[i-1][4])
615                                 layer1 = texture->mipmap[i-1][4]-1;
616                         for (y = 0;y < texture->mipmap[i][3];y++)
617                         {
618                                 row0 = y*2;
619                                 row1 = y*2+1;
620                                 if (row1 >= texture->mipmap[i-1][3])
621                                         row1 = texture->mipmap[i-1][3]-1;
622                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
623                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
624                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
625                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
626                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
627                                 w = texture->mipmap[i][2];
628                                 if (layer1 > layer0)
629                                 {
630                                         if (texture->mipmap[i-1][2] > 1)
631                                         {
632                                                 // average 3D texture
633                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
634                                                 {
635                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
636                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
637                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
638                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
639                                                 }
640                                         }
641                                         else
642                                         {
643                                                 // average 3D mipmap with parent width == 1
644                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
645                                                 {
646                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
647                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
648                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
649                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
650                                                 }
651                                         }
652                                 }
653                                 else
654                                 {
655                                         if (texture->mipmap[i-1][2] > 1)
656                                         {
657                                                 // average 2D texture (common case)
658                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
659                                                 {
660                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
661                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
662                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
663                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
664                                                 }
665                                         }
666                                         else
667                                         {
668                                                 // 2D texture with parent width == 1
669                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
670                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
671                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
672                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
673                                         }
674                                 }
675                         }
676                 }
677         }
678 }
679 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
680 {
681         DPSOFTRAST_Texture *texture;
682         unsigned char *dst;
683         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
684         if (texture->binds)
685                 DPSOFTRAST_Flush();
686         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
687         while (blockheight > 0)
688         {
689                 memcpy(dst, pixels, blockwidth * 4);
690                 pixels += blockwidth * 4;
691                 dst += texture->mipmap[0][2] * 4;
692                 blockheight--;
693         }
694         DPSOFTRAST_Texture_CalculateMipmaps(index);
695 }
696 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
697 {
698         DPSOFTRAST_Texture *texture;
699         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
700         if (texture->binds)
701                 DPSOFTRAST_Flush();
702         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
703         DPSOFTRAST_Texture_CalculateMipmaps(index);
704 }
705 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
706 {
707         DPSOFTRAST_Texture *texture;
708         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
709         return texture->mipmap[mip][2];
710 }
711 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
712 {
713         DPSOFTRAST_Texture *texture;
714         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
715         return texture->mipmap[mip][3];
716 }
717 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
718 {
719         DPSOFTRAST_Texture *texture;
720         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
721         return texture->mipmap[mip][4];
722 }
723 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
724 {
725         DPSOFTRAST_Texture *texture;
726         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
727         if (texture->binds)
728                 DPSOFTRAST_Flush();
729         return texture->bytes + texture->mipmap[mip][0];
730 }
731 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
732 {
733         DPSOFTRAST_Texture *texture;
734         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
735         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
736         {
737                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
738                 return;
739         }
740         if (texture->binds)
741                 DPSOFTRAST_Flush();
742         texture->filter = filter;
743 }
744
745 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
746 {
747         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
748                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
749                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
750                 DPSOFTRAST_Flush();
751         dpsoftrast.fb_width = width;
752         dpsoftrast.fb_height = height;
753         dpsoftrast.fb_depthpixels = depthpixels;
754         dpsoftrast.fb_colorpixels[0] = colorpixels0;
755         dpsoftrast.fb_colorpixels[1] = colorpixels1;
756         dpsoftrast.fb_colorpixels[2] = colorpixels2;
757         dpsoftrast.fb_colorpixels[3] = colorpixels3;
758 }
759
760 static void DPSOFTRAST_Draw_FlushThreads(void);
761
762 static void DPSOFTRAST_Draw_SyncCommands(void)
763 {
764         MEMORY_BARRIER;
765         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
766 }
767
// makes sure at least 'space' units of the command pool are not in use.
// threaded build: recomputes the in-use amount from the thread that is furthest
// behind the write position and waits on that thread until enough room exists
// (or no thread is behind at all), then stores the recomputed in-use amount.
// non-threaded build: calls DPSOFTRAST_Draw_FlushThreads.
static void DPSOFTRAST_Draw_FreeCommandPool(int space)
{
#ifdef USE_THREADS
	DPSOFTRAST_State_Thread *worker;
	int t;
	int writepos = dpsoftrast.commandpool.freecommand;
	int inuse = dpsoftrast.commandpool.usedcommands;
	// enough room already, nothing to wait for
	if (inuse <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
		return;
	DPSOFTRAST_Draw_SyncCommands();
	for (;;)
	{
		int slowest = -1;
		int backlog;
		inuse = 0;
		// find the thread whose read position is furthest behind the write position
		for (t = 0; t < dpsoftrast.numthreads; t++)
		{
			worker = &dpsoftrast.threads[t];
			backlog = writepos - worker->commandoffset;
			if (backlog < 0)
				backlog += DPSOFTRAST_DRAW_MAXCOMMANDPOOL; // read position is ahead in the array, so the distance wraps
			if (backlog > inuse)
			{
				slowest = t;
				inuse = backlog;
			}
		}
		if (inuse <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || slowest < 0)
			break;
		worker = &dpsoftrast.threads[slowest];
		SDL_LockMutex(worker->drawmutex);
		// only wait if the thread has not yet reached the published draw position
		if (worker->commandoffset != dpsoftrast.drawcommand)
		{
			worker->waiting = true;
			if (worker->starving) SDL_CondSignal(worker->drawcond);
			SDL_CondWait(worker->waitcond, worker->drawmutex);
			worker->waiting = false;
		}
		SDL_UnlockMutex(worker->drawmutex);
	}
	dpsoftrast.commandpool.usedcommands = inuse;
#else
	DPSOFTRAST_Draw_FlushThreads();
#endif
}
813
// rounds 'size' up to the next multiple of COMMAND_SIZE (unchanged if already a multiple);
// the masking with COMMAND_SIZE-1 assumes COMMAND_SIZE is a power of two
814 #define DPSOFTRAST_ALIGNCOMMAND(size) \
815         ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// allocates a command of type DPSOFTRAST_Command_<name> with opcode DPSOFTRAST_OPCODE_<name>
// via DPSOFTRAST_AllocateCommand, using the aligned struct size, and casts the result
816 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
817         ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
818
819 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
820 {
821         DPSOFTRAST_Command *command;
822         int freecommand = dpsoftrast.commandpool.freecommand;
823         int usedcommands = dpsoftrast.commandpool.usedcommands;
824         int extra = sizeof(DPSOFTRAST_Command);
825         if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
826                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
827         if(usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
828         {
829                 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
830                 freecommand = dpsoftrast.commandpool.freecommand;
831                 usedcommands = dpsoftrast.commandpool.usedcommands;
832         }
833         if(DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
834         {
835                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
836                 command->opcode = DPSOFTRAST_OPCODE_Reset;
837                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
838                 freecommand = 0;
839         }
840         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
841         command->opcode = opcode;
842         command->commandsize = size;
843         freecommand += size;
844         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
845                 freecommand = 0;
846         dpsoftrast.commandpool.freecommand = freecommand;
847         dpsoftrast.commandpool.usedcommands = usedcommands + size;
848         return command;
849 }
850
851 static void DPSOFTRAST_UndoCommand(int size)
852 {
853         int freecommand = dpsoftrast.commandpool.freecommand;
854         int usedcommands = dpsoftrast.commandpool.usedcommands;
855         freecommand -= size;
856         usedcommands -= size;
857         dpsoftrast.commandpool.freecommand = freecommand;
858         dpsoftrast.commandpool.usedcommands = usedcommands;
859 }
860                 
861 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
862 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
863 {
864         thread->viewport[0] = command->x;
865         thread->viewport[1] = command->y;
866         thread->viewport[2] = command->width;
867         thread->viewport[3] = command->height;
868         thread->validate |= DPSOFTRAST_VALIDATE_FB;
869 }
870 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
871 {
872         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
873         command->x = x;
874         command->y = y;
875         command->width = width;
876         command->height = height;
877
878         dpsoftrast.viewport[0] = x;
879         dpsoftrast.viewport[1] = y;
880         dpsoftrast.viewport[2] = width;
881         dpsoftrast.viewport[3] = height;
882         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
883 }
884
885 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
886 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
887 {
888         int i, x1, y1, x2, y2, w, h, x, y, t1, t2;
889         unsigned int *p;
890         unsigned int c;
891         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
892         x1 = thread->fb_clearscissor[0];
893         y1 = thread->fb_clearscissor[1];
894         x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
895         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
896         t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
897         t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
898         if(y1 < t1) y1 = t1;
899         if(y2 > t2) y2 = t2;
900         w = x2 - x1;
901         h = y2 - y1;
902         if (w < 1 || h < 1)
903                 return;
904         // FIXME: honor fb_colormask?
905         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
906         for (i = 0;i < 4;i++)
907         {
908                 if (!dpsoftrast.fb_colorpixels[i])
909                         continue;
910                 for (y = y1;y < y2;y++)
911                 {
912                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
913                         for (x = x1;x < x2;x++)
914                                 p[x] = c;
915                 }
916         }
917 }
918 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
919 {
920         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
921         command->r = r;
922         command->g = g;
923         command->b = b;
924         command->a = a;
925 }
926
927 DEFCOMMAND(3, ClearDepth, float depth;)
928 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
929 {
930         int x1, y1, x2, y2, w, h, x, y, t1, t2;
931         unsigned int *p;
932         unsigned int c;
933         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
934         x1 = thread->fb_clearscissor[0];
935         y1 = thread->fb_clearscissor[1];
936         x2 = thread->fb_clearscissor[0] + thread->fb_clearscissor[2];
937         y2 = thread->fb_clearscissor[1] + thread->fb_clearscissor[3];
938         t1 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
939         t2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
940         if(y1 < t1) y1 = t1;
941         if(y2 > t2) y2 = t2;
942         w = x2 - x1;
943         h = y2 - y1;
944         if (w < 1 || h < 1)
945                 return;
946         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
947         for (y = y1;y < y2;y++)
948         {
949                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
950                 for (x = x1;x < x2;x++)
951                         p[x] = c;
952         }
953 }
954 void DPSOFTRAST_ClearDepth(float d)
955 {
956         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
957         command->depth = d;
958 }
959
960 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
961 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
962 {
963         thread->colormask[0] = command->r != 0;
964         thread->colormask[1] = command->g != 0;
965         thread->colormask[2] = command->b != 0;
966         thread->colormask[3] = command->a != 0;
967         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
968 }
969 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
970 {
971         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
972         command->r = r;
973         command->g = g;
974         command->b = b;
975         command->a = a;
976 }
977
978 DEFCOMMAND(5, DepthTest, int enable;)
979 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
980 {
981         thread->depthtest = command->enable;
982         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
983 }
984 void DPSOFTRAST_DepthTest(int enable)
985 {
986         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
987         command->enable = enable;
988 }
989
990 DEFCOMMAND(6, ScissorTest, int enable;)
991 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
992 {
993         thread->scissortest = command->enable;
994         thread->validate |= DPSOFTRAST_VALIDATE_FB;
995 }
996 void DPSOFTRAST_ScissorTest(int enable)
997 {
998         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
999         command->enable = enable;
1000 }
1001
1002 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1003 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1004 {
1005         thread->scissor[0] = command->x;
1006         thread->scissor[1] = command->y;
1007         thread->scissor[2] = command->width;
1008         thread->scissor[3] = command->height;
1009         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1010 }
1011 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1012 {
1013         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1014         command->x = x;
1015         command->y = y;
1016         command->width = width;
1017         command->height = height;
1018 }
1019
1020 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1021 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1022 {
1023         thread->blendfunc[0] = command->sfactor;
1024         thread->blendfunc[1] = command->dfactor;
1025         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1026 }
1027 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1028 {
1029         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1030         command->sfactor = sfactor;
1031         command->dfactor = dfactor;
1032 }
1033
1034 DEFCOMMAND(9, BlendSubtract, int enable;)
1035 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1036 {
1037         thread->blendsubtract = command->enable;
1038         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1039 }
1040 void DPSOFTRAST_BlendSubtract(int enable)
1041 {
1042         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1043         command->enable = enable;
1044 }
1045
1046 DEFCOMMAND(10, DepthMask, int enable;)
1047 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1048 {
1049         thread->depthmask = command->enable;
1050 }
1051 void DPSOFTRAST_DepthMask(int enable)
1052 {
1053         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1054         command->enable = enable;
1055 }
1056
1057 DEFCOMMAND(11, DepthFunc, int func;)
1058 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1059 {
1060         thread->depthfunc = command->func;
1061 }
1062 void DPSOFTRAST_DepthFunc(int func)
1063 {
1064         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1065         command->func = func;
1066 }
1067
1068 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1069 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1070 {
1071         thread->depthrange[0] = command->nearval;
1072         thread->depthrange[1] = command->farval;
1073 }
1074 void DPSOFTRAST_DepthRange(float nearval, float farval)
1075 {
1076         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1077         command->nearval = nearval;
1078         command->farval = farval;
1079 }
1080
1081 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1082 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1083 {
1084         thread->polygonoffset[0] = command->alongnormal;
1085         thread->polygonoffset[1] = command->intoview;
1086 }
1087 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1088 {
1089         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1090         command->alongnormal = alongnormal;
1091         command->intoview = intoview;
1092 }
1093
1094 DEFCOMMAND(14, CullFace, int mode;)
1095 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1096 {
1097         thread->cullface = command->mode;
1098 }
1099 void DPSOFTRAST_CullFace(int mode)
1100 {
1101         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1102         command->mode = mode;
1103 }
1104
1105 DEFCOMMAND(15, AlphaTest, int enable;)
1106 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1107 {
1108         thread->alphatest = command->enable;
1109 }
1110 void DPSOFTRAST_AlphaTest(int enable)
1111 {
1112         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1113         command->enable = enable;
1114 }
1115
1116 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1117 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1118 {
1119         thread->alphafunc = command->func;
1120         thread->alphavalue = command->ref;
1121 }
1122 void DPSOFTRAST_AlphaFunc(int func, float ref)
1123 {
1124         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1125         command->func = func;
1126         command->ref = ref;
1127 }
1128
1129 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1130 {
1131         dpsoftrast.color[0] = r;
1132         dpsoftrast.color[1] = g;
1133         dpsoftrast.color[2] = b;
1134         dpsoftrast.color[3] = a;
1135 }
1136
1137 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1138 {
1139         int outstride = blockwidth * 4;
1140         int instride = dpsoftrast.fb_width * 4;
1141         int bx1 = blockx;
1142         int by1 = blocky;
1143         int bx2 = blockx + blockwidth;
1144         int by2 = blocky + blockheight;
1145         int bw;
1146         int bh;
1147         int x;
1148         int y;
1149         unsigned char *inpixels;
1150         unsigned char *b;
1151         unsigned char *o;
1152         DPSOFTRAST_Flush();
1153         if (bx1 < 0) bx1 = 0;
1154         if (by1 < 0) by1 = 0;
1155         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1156         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1157         bw = bx2 - bx1;
1158         bh = by2 - by1;
1159         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1160         if (dpsoftrast.bigendian)
1161         {
1162                 for (y = by1;y < by2;y++)
1163                 {
1164                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1165                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1166                         for (x = bx1;x < bx2;x++)
1167                         {
1168                                 o[0] = b[3];
1169                                 o[1] = b[2];
1170                                 o[2] = b[1];
1171                                 o[3] = b[0];
1172                                 o += 4;
1173                                 b += 4;
1174                         }
1175                 }
1176         }
1177         else
1178         {
1179                 for (y = by1;y < by2;y++)
1180                 {
1181                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1182                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1183                         memcpy(o, b, bw*4);
1184                 }
1185         }
1186
1187 }
1188 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1189 {
1190         int tx1 = tx;
1191         int ty1 = ty;
1192         int tx2 = tx + width;
1193         int ty2 = ty + height;
1194         int sx1 = sx;
1195         int sy1 = sy;
1196         int sx2 = sx + width;
1197         int sy2 = sy + height;
1198         int swidth;
1199         int sheight;
1200         int twidth;
1201         int theight;
1202         int sw;
1203         int sh;
1204         int tw;
1205         int th;
1206         int y;
1207         unsigned int *spixels;
1208         unsigned int *tpixels;
1209         DPSOFTRAST_Texture *texture;
1210         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1211         if (mip < 0 || mip >= texture->mipmaps) return;
1212         if (texture->binds)
1213                 DPSOFTRAST_Flush();
1214         spixels = dpsoftrast.fb_colorpixels[0];
1215         swidth = dpsoftrast.fb_width;
1216         sheight = dpsoftrast.fb_height;
1217         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1218         twidth = texture->mipmap[mip][2];
1219         theight = texture->mipmap[mip][3];
1220         if (tx1 < 0) tx1 = 0;
1221         if (ty1 < 0) ty1 = 0;
1222         if (tx2 > twidth) tx2 = twidth;
1223         if (ty2 > theight) ty2 = theight;
1224         if (sx1 < 0) sx1 = 0;
1225         if (sy1 < 0) sy1 = 0;
1226         if (sx2 > swidth) sx2 = swidth;
1227         if (sy2 > sheight) sy2 = sheight;
1228         tw = tx2 - tx1;
1229         th = ty2 - ty1;
1230         sw = sx2 - sx1;
1231         sh = sy2 - sy1;
1232         if (tw > sw) tw = sw;
1233         if (th > sh) th = sh;
1234         if (tw < 1 || th < 1)
1235                 return;
1236         for (y = 0;y < th;y++)
1237                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1238         if (texture->mipmaps > 1)
1239                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1240 }
1241
1242 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1243 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1244 {
1245         if (thread->texbound[command->unitnum])
1246                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1247         thread->texbound[command->unitnum] = command->texture;
1248 }
1249 void DPSOFTRAST_SetTexture(int unitnum, int index)
1250 {
1251         DPSOFTRAST_Command_SetTexture *command;
1252         DPSOFTRAST_Texture *texture;
1253         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1254         {
1255                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1256                 return;
1257         }
1258         texture = DPSOFTRAST_Texture_GetByIndex(index);
1259         if (index && !texture)
1260         {
1261                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1262                 return;
1263         }
1264
1265         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1266         command->unitnum = unitnum;
1267         command->texture = texture;
1268
1269         dpsoftrast.texbound[unitnum] = texture;
1270         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1271 }
1272
1273 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1274 {
1275         dpsoftrast.pointer_vertex3f = vertex3f;
1276         dpsoftrast.stride_vertex = stride;
1277 }
1278 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1279 {
1280         dpsoftrast.pointer_color4f = color4f;
1281         dpsoftrast.pointer_color4ub = NULL;
1282         dpsoftrast.stride_color = stride;
1283 }
1284 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1285 {
1286         dpsoftrast.pointer_color4f = NULL;
1287         dpsoftrast.pointer_color4ub = color4ub;
1288         dpsoftrast.stride_color = stride;
1289 }
1290 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1291 {
1292         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1293         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1294         dpsoftrast.stride_texcoord[unitnum] = stride;
1295 }
1296
1297 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1298 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1299 {
1300         thread->shader_mode = command->mode;
1301         thread->shader_permutation = command->permutation;
1302 }
1303 void DPSOFTRAST_SetShader(int mode, int permutation)
1304 {
1305         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1306         command->mode = mode;
1307         command->permutation = permutation;
1308
1309         dpsoftrast.shader_mode = mode;
1310         dpsoftrast.shader_permutation = permutation;
1311 }
1312
1313 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1314 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1315 {
1316         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1317 }
1318 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1319 {
1320         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1321         command->index = index;
1322         command->val[0] = v0;
1323         command->val[1] = v1;
1324         command->val[2] = v2;
1325         command->val[3] = v3;
1326
1327         dpsoftrast.uniform4f[index*4+0] = v0;
1328         dpsoftrast.uniform4f[index*4+1] = v1;
1329         dpsoftrast.uniform4f[index*4+2] = v2;
1330         dpsoftrast.uniform4f[index*4+3] = v3;
1331 }
1332 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1333 {
1334         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1335         command->index = index;
1336         memcpy(command->val, v, sizeof(command->val));
1337
1338         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1339 }
1340
1341 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1342 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1343 {
1344         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1345 }
1346 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1347 {
1348 #ifdef SSE2_PRESENT
1349         int i, index;
1350         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1351         {
1352                 __m128 m0, m1, m2, m3;
1353                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1354                 command->index = index;
1355                 if (((size_t)v)&(ALIGN_SIZE-1))
1356                 {
1357                         m0 = _mm_loadu_ps(v);
1358                         m1 = _mm_loadu_ps(v+4);
1359                         m2 = _mm_loadu_ps(v+8);
1360                         m3 = _mm_loadu_ps(v+12);
1361                 }
1362                 else
1363                 {
1364                         m0 = _mm_load_ps(v);
1365                         m1 = _mm_load_ps(v+4);
1366                         m2 = _mm_load_ps(v+8);
1367                         m3 = _mm_load_ps(v+12);
1368                 }
1369                 if (transpose)
1370                 {
1371                         __m128 t0, t1, t2, t3;
1372                         t0 = _mm_unpacklo_ps(m0, m1);
1373                         t1 = _mm_unpacklo_ps(m2, m3);
1374                         t2 = _mm_unpackhi_ps(m0, m1);
1375                         t3 = _mm_unpackhi_ps(m2, m3);
1376                         m0 = _mm_movelh_ps(t0, t1);
1377                         m1 = _mm_movehl_ps(t1, t0);
1378                         m2 = _mm_movelh_ps(t2, t3);
1379                         m3 = _mm_movehl_ps(t3, t2);                     
1380                 }
1381                 _mm_store_ps(command->val, m0);
1382                 _mm_store_ps(command->val+4, m1);
1383                 _mm_store_ps(command->val+8, m2);
1384                 _mm_store_ps(command->val+12, m3);
1385                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1386                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1387                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1388                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1389         }
1390 #endif
1391 }
1392
1393 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1394 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1395 {
1396         thread->uniform1i[command->index] = command->val;
1397 }
1398 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1399 {
1400         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1401         command->index = index;
1402         command->val = i0;
1403
1404         dpsoftrast.uniform1i[command->index] = i0;
1405 }
1406
1407 #ifdef SSE2_PRESENT
1408 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1409 {
1410         float *end = dst + size*4;
1411         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1412         {
1413                 while (dst < end)
1414                 {
1415                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1416                         dst += 4;
1417                         src += stride;
1418                 }
1419         }
1420         else
1421         {
1422                 while (dst < end)
1423                 {
1424                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1425                         dst += 4;
1426                         src += stride;
1427                 }
1428         }
1429 }
1430
// Expands `size` 3-float vectors into 4-float vectors, storing (x, y, z, 1.0f)
// for each one.
//   dst    - output; written with aligned 16-byte stores, so it must be
//            16-byte aligned
//   src    - first input vector, addressed as raw bytes
//   size   - number of vectors to convert
//   stride - distance in bytes between consecutive input vectors
// Exactly 12 bytes are read per input vector; nothing past the last vector is
// touched.
static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	const __m128 one = _mm_set_ss(1.0f);
	if (stride == sizeof(float[3]))
	{
		// tightly packed input: four vectors fill exactly three 16-byte
		// loads, so convert them four at a time
		float *end4 = dst + (size&~3)*4;
		// src advances 48 bytes per iteration, so its alignment never changes
		int aligned = !(((size_t)src)&(ALIGN_SIZE - 1));
		while (dst < end4)
		{
			const float *in3f = (const float *)src;
			__m128 v1, v2, v3, dv;
			if (aligned)
			{
				v1 = _mm_load_ps(in3f);
				v2 = _mm_load_ps(in3f + 4);
				v3 = _mm_load_ps(in3f + 8);
			}
			else
			{
				v1 = _mm_loadu_ps(in3f);
				v2 = _mm_loadu_ps(in3f + 4);
				v3 = _mm_loadu_ps(in3f + 8);
			}
			// v1 = x0 y0 z0 x1, v2 = y1 z1 x2 y2, v3 = z2 x3 y3 z3
			// each vector is first arranged as (?, x, y, z), lane 0 is then
			// overwritten with 1.0 and the lanes rotated to (x, y, z, 1)
			dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
			dv = _mm_move_ss(dv, one);
			_mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
			dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
			dv = _mm_move_ss(dv, one);
			_mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
			dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
			dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
			dv = _mm_move_ss(dv, one);
			_mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
			dv = _mm_move_ss(v3, one);
			_mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
			dst += 16;
			src += 4*sizeof(float[3]);
		}
	}
	// leftover (or strided) vectors, one at a time.
	// A full 16-byte load here would read 4 bytes beyond the 12-byte vector,
	// which runs off the end of the input on the last vector, so x,y are
	// fetched with an 8-byte load and z with a 4-byte load instead. Neither
	// load has an alignment requirement.
	while (dst < end)
	{
		const float *in3f = (const float *)src;
		__m128 xy = _mm_loadl_pi(_mm_setzero_ps(), (const __m64 *)in3f); // x y 0 0
		__m128 z1 = _mm_unpacklo_ps(_mm_load_ss(in3f + 2), one);         // z 1 0 0
		_mm_store_ps(dst, _mm_movelh_ps(xy, z1));                        // x y z 1
		dst += 4;
		src += stride;
	}
}
1507
// Expands `size` 2-float vectors into 4-float vectors, storing
// (x, y, 0.0f, 1.0f) for each one.
//   dst    - output; written with aligned 16-byte stores
//   src    - first input vector, addressed as raw bytes
//   size   - number of vectors to convert
//   stride - distance in bytes between consecutive input vectors
static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float * const stop = dst + size*4;
	// only the upper two lanes (0, 1) of this constant ever reach the output
	const __m128 zw = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
	if (stride == sizeof(float[2]))
	{
		// tightly packed input: one 16-byte load holds two vectors
		float * const pairstop = dst + (size&~1)*4;
		// src moves 16 bytes per iteration, so this is decided once
		const int misaligned = (((size_t)src)&(ALIGN_SIZE - 1)) != 0;
		for (; dst < pairstop; dst += 8, src += 2*sizeof(float[2]))
		{
			const __m128 pair = misaligned ? _mm_loadu_ps((const float *)src) : _mm_load_ps((const float *)src);
			// x0 y0 0 1
			_mm_store_ps(dst, _mm_shuffle_ps(pair, zw, _MM_SHUFFLE(3, 2, 1, 0)));
			// x1 y1 0 1
			_mm_store_ps(dst + 4, _mm_movehl_ps(zw, pair));
		}
	}
	// leftover (or strided) vectors: an 8-byte load replaces the low two lanes
	for (; dst < stop; dst += 4, src += stride)
		_mm_store_ps(dst, _mm_loadl_pi(zw, (const __m64 *)src));
}
1545
// Converts `size` 4-byte vectors into 4-float vectors, scaling every byte by
// 1/255 so that 0..255 maps to 0.0..1.0. Component order is preserved.
//   dst    - output; written with aligned 16-byte stores
//   src    - first input vector
//   size   - number of vectors to convert
//   stride - distance in bytes between consecutive input vectors
static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
{
	float *end = dst + size*4;
	const __m128 scale = _mm_set1_ps(1.0f/255.0f);
	const __m128i zero = _mm_setzero_si128();
	if (stride == sizeof(unsigned char[4]))
	{
		// tightly packed input: one 16-byte load holds four vectors
		float *end4 = dst + (size&~3)*4;
		// src advances 16 bytes per iteration, so alignment never changes
		int aligned = !(((size_t)src)&(ALIGN_SIZE - 1));
		while (dst < end4)
		{
			__m128i v = aligned ? _mm_load_si128((const __m128i *)src) : _mm_loadu_si128((const __m128i *)src);
			// widen bytes to 16 bit, then to 32 bit, by interleaving zeros
			__m128i lo = _mm_unpacklo_epi8(v, zero), hi = _mm_unpackhi_epi8(v, zero);
			_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(lo, zero)), scale));
			_mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(lo, zero)), scale));
			_mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(hi, zero)), scale));
			_mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(hi, zero)), scale));
			dst += 16;
			src += 4*sizeof(unsigned char[4]);
		}
	}
	// leftover (or strided) vectors, one at a time
	while (dst < end)
	{
		int packed;
		__m128i v;
		// src is a byte pointer with an arbitrary stride, so fetch the four
		// bytes with memcpy instead of dereferencing it as an int (which
		// would be a misaligned, aliasing-violating access)
		memcpy(&packed, src, sizeof(packed));
		v = _mm_cvtsi32_si128(packed);
		_mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, zero), zero)), scale));
		dst += 4;
		src += stride;
	}
}
1588
// Writes `size` copies of the 4-float vector at `src` to `dst`.
// src is read once with an unaligned load; dst is written with aligned
// 16-byte stores.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 fill = _mm_loadu_ps(src);
	float * const stop = dst + 4*size;
	float *cursor;
	for (cursor = dst; cursor < stop; cursor += 4)
		_mm_store_ps(cursor, fill);
}
1599 #endif
1600
// Multiplies `numitems` 4-float vectors by a 4x4 matrix:
//   out = in.x*M[0..3] + in.y*M[4..7] + in.z*M[8..11] + in.w*M[12..15]
// An identity matrix is detected by a bytewise compare and handled as a plain
// copy (skipped entirely when out4f and in4f are the same pointer).
// out4f is written with aligned 16-byte stores; in4f may be unaligned.
// Without SSE2_PRESENT this function does nothing.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 row0, row1, row2, row3;
	float *stop;
	int misaligned;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	stop = out4f + numitems*4;
	row0 = _mm_loadu_ps(inmatrix16f);
	row1 = _mm_loadu_ps(inmatrix16f + 4);
	row2 = _mm_loadu_ps(inmatrix16f + 8);
	row3 = _mm_loadu_ps(inmatrix16f + 12);
	// in4f advances 16 bytes per vector, so its alignment is decided once
	misaligned = (((size_t)in4f)&(ALIGN_SIZE-1)) != 0;
	for (; out4f < stop; out4f += 4, in4f += 4)
	{
		const __m128 v = misaligned ? _mm_loadu_ps(in4f) : _mm_load_ps(in4f);
		// broadcast each component and scale the matching matrix row
		const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0);
		const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1);
		const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2);
		const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3);
		_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
	}
#endif
}
1648
// Copies `numitems` 4-float vectors from in4f to out4f.
// Nothing is done when numitems is not positive, or when out4f and in4f are
// the same pointer (memcpy must not be given overlapping buffers; this matches
// the check in DPSOFTRAST_Vertex_Transform's identity path).
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0 || out4f == in4f)
		return;
	memcpy(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
1653
1654 #ifdef SSE2_PRESENT
// Perspective-divides a 4-float vertex and applies the viewport transform.
// With in = (x, y, z, w), center = viewportcenter and scale = viewportscale:
//   out = (center[1] + scale[1]*x/w,
//          center[2] + scale[2]*y/w,
//          center[3] + scale[3]*z/w,
//          center[0] + scale[0]/w)
// i.e. lane 0 of the viewport vectors belongs to the 1/w term that ends up in
// lane 3 of the result. `out` may be the same variable as `in`.
// The expansion declares locals named p and w, so arguments must not use
// those names.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in); \
	__m128 w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)); \
	p = _mm_move_ss(p, _mm_set_ss(1.0f)); \
	p = _mm_div_ps(_mm_mul_ps(viewportscale, p), w); \
	p = _mm_add_ps(viewportcenter, p); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1662
// Same operation as DPSOFTRAST_PROJECTVERTEX (all four lanes are projected,
// not only Y), so this simply forwards to it.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
	DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale)
1670
// Transforms a 4-float vertex by four __m128 matrix rows:
//   out = in.x*m0 + (in.y*m1 + (in.z*m2 + in.w*m3))
// `out` may be the same variable as `in`; `out` is assigned exactly once.
// The expansion declares locals named p and dpsoftrast_tv_sum, so arguments
// must not use those names.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	__m128 dpsoftrast_tv_sum = _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3); \
	dpsoftrast_tv_sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), dpsoftrast_tv_sum); \
	dpsoftrast_tv_sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), dpsoftrast_tv_sum); \
	out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), dpsoftrast_tv_sum); \
}
1679
// Computes the screen-space Y range covered by an axis-aligned bounding box.
//   starty, endy   - receive the resulting Y range
//   minpos, maxpos - componentwise minimum/maximum of the untransformed
//                    positions; together they describe the 8 box corners
//   viewportcenter, viewportscale - viewport vectors laid out as used by
//                    DPSOFTRAST_PROJECTVERTEX (lane 2 holds the Y term)
//   m0..m3         - matrix rows applied to every corner
// Returns an 8-bit mask in which bit k stays set when corner k fails the
// z + w >= 0 test after transformation (presumably the near clip plane -
// TODO confirm against the projection convention); 0 means no corner failed.
// Corner index bits: bit 0 selects max X, bit 1 max Y, bit 2 max Z and W.
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
{
	int clipmask = 0xFF;
	int k, axis;
	__m128 corner[8], bb[8], clipdist[8];
	__m128 minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
	const __m128 zero = _mm_setzero_ps();

	// swap lanes 0 and 1 of every row so the transformed Y lands in lane 0,
	// where the scalar (_ss) operations below can work on it
	m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
	m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
	m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
	m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));

	// the eight corners of the box
	corner[0] = minpos;
	corner[1] = _mm_move_ss(minpos, maxpos);
	corner[2] = _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[3] = _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[4] = _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[5] = _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0));
	corner[6] = _mm_move_ss(maxpos, minpos);
	corner[7] = maxpos;

	// pass 1: transform every corner; corners passing the z + w >= 0 test
	// contribute their y/w directly
	for (k = 0; k < 8; k++)
	{
		DPSOFTRAST_TRANSFORMVERTEX(bb[k], corner[k], m0, m1, m2, m3);
		clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3)));
		if (_mm_ucomige_ss(clipdist[k], zero))
		{
			__m128 proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3)));
			clipmask &= ~(1<<k);
			minproj = _mm_min_ss(minproj, proj);
			maxproj = _mm_max_ss(maxproj, proj);
		}
	}

	// pass 2: for every failing corner, walk its three box edges (X, Y, Z
	// neighbours are k^1, k^2, k^4); where the neighbour passed, use the
	// point on the edge where z + w crosses zero instead
	for (k = 0; k < 8; k++)
	{
		if (!(clipmask&(1<<k)))
			continue;
		for (axis = 1; axis <= 4; axis <<= 1)
		{
			const int n = k^axis;
			__m128 frac, proj;
			if (clipmask&(1<<n))
				continue;
			frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[n]));
			proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[n], bb[k])));
			proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3)));
			minproj = _mm_min_ss(minproj, proj);
			maxproj = _mm_max_ss(maxproj, proj);
		}
	}

	// clamp the projected range to [-2, 2]
	minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
	maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
	// bring lane 2 of the viewport vectors into lane 0 and map to pixels
	viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
	viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
	minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
	maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
	// the maximum projected value becomes starty and the minimum endy
	// (both truncated toward zero, endy then incremented)
	*starty = _mm_cvttss_si32(maxproj);
	*endy = _mm_cvttss_si32(minproj)+1;
	return clipmask;
}
1750 #endif
1751         
// Projects `numitems` 4-float vertices to screen space without transforming
// them first.
//   out4f    - receives an unmodified copy of every input vertex
//   screen4f - receives every vertex after DPSOFTRAST_PROJECTVERTEX with the
//              current dpsoftrast.fb_viewportcenter / fb_viewportscale
//   starty, endy - when both are non-NULL they receive the screen Y range of
//              the vertices' bounding box (see DPSOFTRAST_Vertex_BoundY)
//   in4f     - input vertices; may be unaligned
// Returns the clip mask from DPSOFTRAST_Vertex_BoundY, or 0 when starty/endy
// were not both supplied. Builds without SSE2_PRESENT have no implementation
// and always return 0.
static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
{
#ifdef SSE2_PRESENT
	float *end = out4f + numitems*4;
	__m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	__m128 minpos, maxpos;
	// in4f advances 16 bytes per vertex, so its alignment is decided once
	int aligned = !(((size_t)in4f)&(ALIGN_SIZE-1));
	// seed the bounds with the first vertex
	minpos = maxpos = aligned ? _mm_load_ps(in4f) : _mm_loadu_ps(in4f);
	while (out4f < end)
	{
		__m128 v = aligned ? _mm_load_ps(in4f) : _mm_loadu_ps(in4f);
		minpos = _mm_min_ps(minpos, v);
		maxpos = _mm_max_ps(maxpos, v);
		_mm_store_ps(out4f, v);
		DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
		_mm_store_ps(screen4f, v);
		in4f += 4;
		out4f += 4;
		screen4f += 4;
	}
	if (starty && endy)
		// the vertices were not transformed, so bound them with identity rows
		return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
					_mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
					_mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
					_mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
					_mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
#endif
	// also reached by builds without SSE2, which previously fell off the end
	// of this non-void function
	return 0;
}
1799
// Transforms `numitems` 4-float vertices by a 4x4 matrix and projects the
// result to screen space.
//   out4f    - receives every transformed vertex
//   screen4f - receives every transformed vertex after
//              DPSOFTRAST_PROJECTVERTEX with the current
//              dpsoftrast.fb_viewportcenter / fb_viewportscale
//   starty, endy - when both are non-NULL they receive the screen Y range of
//              the vertices' bounding box (see DPSOFTRAST_Vertex_BoundY)
//   in4f     - input vertices; may be unaligned
//   inmatrix16f - 16 floats, read as four consecutive rows of 4
// An identity matrix is detected by a bytewise compare and handed to
// DPSOFTRAST_Vertex_Project. Returns the clip mask from
// DPSOFTRAST_Vertex_BoundY, or 0 when starty/endy were not both supplied.
// Builds without SSE2_PRESENT have no implementation and always return 0.
static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
	float *end;
	int aligned;
	if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
		return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
	end = out4f + numitems*4;
	viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
	viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	m0 = _mm_loadu_ps(inmatrix16f);
	m1 = _mm_loadu_ps(inmatrix16f + 4);
	m2 = _mm_loadu_ps(inmatrix16f + 8);
	m3 = _mm_loadu_ps(inmatrix16f + 12);
	// in4f advances 16 bytes per vertex, so its alignment is decided once
	aligned = !(((size_t)in4f)&(ALIGN_SIZE-1));
	// seed the bounds with the first vertex
	minpos = maxpos = aligned ? _mm_load_ps(in4f) : _mm_loadu_ps(in4f);
	while (out4f < end)
	{
		__m128 v = aligned ? _mm_load_ps(in4f) : _mm_loadu_ps(in4f);
		// the bounds are tracked on the untransformed positions; BoundY
		// applies the matrix to the box corners itself
		minpos = _mm_min_ps(minpos, v);
		maxpos = _mm_max_ps(maxpos, v);
		DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
		_mm_store_ps(out4f, v);
		DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
		_mm_store_ps(screen4f, v);
		in4f += 4;
		out4f += 4;
		screen4f += 4;
	}
	if (starty && endy)
		return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
#endif
	// also reached by builds without SSE2, which previously fell off the end
	// of this non-void function
	return 0;
}
1854
1855 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1856 {
1857         float *outf = dpsoftrast.post_array4f[outarray];
1858         const unsigned char *inb;
1859         int firstvertex = dpsoftrast.firstvertex;
1860         int numvertices = dpsoftrast.numvertices;
1861         int stride;
1862         switch(inarray)
1863         {
1864         case DPSOFTRAST_ARRAY_POSITION:
1865                 stride = dpsoftrast.stride_vertex;
1866                 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1867                 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1868                 break;
1869         case DPSOFTRAST_ARRAY_COLOR:
1870                 stride = dpsoftrast.stride_color;
1871                 if (dpsoftrast.pointer_color4f)
1872                 {
1873                         inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1874                         DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1875                 }
1876                 else if (dpsoftrast.pointer_color4ub)
1877                 {
1878                         stride = dpsoftrast.stride_color;
1879                         inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1880                         DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1881                 }
1882                 else
1883                 {
1884                         DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1885                 }
1886                 break;
1887         default:
1888                 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1889                 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1890                 {
1891                         inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1892                         switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1893                         {
1894                         case 2:
1895                                 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1896                                 break;
1897                         case 3:
1898                                 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1899                                 break;
1900                         case 4:
1901                                 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1902                                 break;
1903                         }
1904                 }
1905                 break;
1906         }
1907         return outf;
1908 }
1909
1910 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1911 {
1912         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1913         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1914         return data;
1915 }
1916
#if 0
// (compiled out) Projects a vertex array to screen space without
// transforming it, storing the clip mask in dpsoftrast.drawclipped and the
// Y range in dpsoftrast.drawstarty / drawendy. A negative inarray skips the
// load and uses the buffer's current contents.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
}
#endif
1925
1926 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1927 {
1928         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1929         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
1930         return data;
1931 }
1932
1933 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1934 {
1935         int x;
1936         int startx = span->startx;
1937         int endx = span->endx;
1938         float wslope = triangle->w[0];
1939         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1940         float endz = 1.0f / (w + wslope * startx);
1941         for (x = startx;x < endx;)
1942         {
1943                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1944                 float z = endz, dz;
1945                 if(nextsub >= endx) nextsub = endsub = endx-1;
1946                 endz = 1.0f / (w + wslope * nextsub);
1947                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1948                 for (; x <= endsub; x++, z += dz)
1949                         zf[x] = z;
1950         }
1951 }
1952
1953 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1954 {
1955         int x;
1956         int startx = span->startx;
1957         int endx = span->endx;
1958         int d[4];
1959         float a, b;
1960         unsigned char * RESTRICT pixelmask = span->pixelmask;
1961         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1962         if (!pixel)
1963                 return;
1964         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1965         // handle alphatest now (this affects depth writes too)
1966         if (thread->alphatest)
1967                 for (x = startx;x < endx;x++)
1968                         if (in4f[x*4+3] < 0.5f)
1969                                 pixelmask[x] = false;
1970         // FIXME: this does not handle bigendian
1971         switch(thread->fb_blendmode)
1972         {
1973         case DPSOFTRAST_BLENDMODE_OPAQUE:
1974                 for (x = startx;x < endx;x++)
1975                 {
1976                         if (!pixelmask[x])
1977                                 continue;
1978                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1979                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1980                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1981                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1982                         pixel[x*4+0] = d[0];
1983                         pixel[x*4+1] = d[1];
1984                         pixel[x*4+2] = d[2];
1985                         pixel[x*4+3] = d[3];
1986                 }
1987                 break;
1988         case DPSOFTRAST_BLENDMODE_ALPHA:
1989                 for (x = startx;x < endx;x++)
1990                 {
1991                         if (!pixelmask[x])
1992                                 continue;
1993                         a = in4f[x*4+3] * 255.0f;
1994                         b = 1.0f - in4f[x*4+3];
1995                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
1996                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
1997                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
1998                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
1999                         pixel[x*4+0] = d[0];
2000                         pixel[x*4+1] = d[1];
2001                         pixel[x*4+2] = d[2];
2002                         pixel[x*4+3] = d[3];
2003                 }
2004                 break;
2005         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2006                 for (x = startx;x < endx;x++)
2007                 {
2008                         if (!pixelmask[x])
2009                                 continue;
2010                         a = in4f[x*4+3] * 255.0f;
2011                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2012                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2013                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2014                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2015                         pixel[x*4+0] = d[0];
2016                         pixel[x*4+1] = d[1];
2017                         pixel[x*4+2] = d[2];
2018                         pixel[x*4+3] = d[3];
2019                 }
2020                 break;
2021         case DPSOFTRAST_BLENDMODE_ADD:
2022                 for (x = startx;x < endx;x++)
2023                 {
2024                         if (!pixelmask[x])
2025                                 continue;
2026                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2027                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2028                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2029                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2030                         pixel[x*4+0] = d[0];
2031                         pixel[x*4+1] = d[1];
2032                         pixel[x*4+2] = d[2];
2033                         pixel[x*4+3] = d[3];
2034                 }
2035                 break;
2036         case DPSOFTRAST_BLENDMODE_INVMOD:
2037                 for (x = startx;x < endx;x++)
2038                 {
2039                         if (!pixelmask[x])
2040                                 continue;
2041                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2042                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2043                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2044                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2045                         pixel[x*4+0] = d[0];
2046                         pixel[x*4+1] = d[1];
2047                         pixel[x*4+2] = d[2];
2048                         pixel[x*4+3] = d[3];
2049                 }
2050                 break;
2051         case DPSOFTRAST_BLENDMODE_MUL:
2052                 for (x = startx;x < endx;x++)
2053                 {
2054                         if (!pixelmask[x])
2055                                 continue;
2056                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2057                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2058                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2059                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2060                         pixel[x*4+0] = d[0];
2061                         pixel[x*4+1] = d[1];
2062                         pixel[x*4+2] = d[2];
2063                         pixel[x*4+3] = d[3];
2064                 }
2065                 break;
2066         case DPSOFTRAST_BLENDMODE_MUL2:
2067                 for (x = startx;x < endx;x++)
2068                 {
2069                         if (!pixelmask[x])
2070                                 continue;
2071                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2072                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2073                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2074                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2075                         pixel[x*4+0] = d[0];
2076                         pixel[x*4+1] = d[1];
2077                         pixel[x*4+2] = d[2];
2078                         pixel[x*4+3] = d[3];
2079                 }
2080                 break;
2081         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2082                 for (x = startx;x < endx;x++)
2083                 {
2084                         if (!pixelmask[x])
2085                                 continue;
2086                         a = in4f[x*4+3] * -255.0f;
2087                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2088                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2089                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2090                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2091                         pixel[x*4+0] = d[0];
2092                         pixel[x*4+1] = d[1];
2093                         pixel[x*4+2] = d[2];
2094                         pixel[x*4+3] = d[3];
2095                 }
2096                 break;
2097         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2098                 for (x = startx;x < endx;x++)
2099                 {
2100                         if (!pixelmask[x])
2101                                 continue;
2102                         a = 255.0f;
2103                         b = 1.0f - in4f[x*4+3];
2104                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2105                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2106                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2107                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2108                         pixel[x*4+0] = d[0];
2109                         pixel[x*4+1] = d[1];
2110                         pixel[x*4+2] = d[2];
2111                         pixel[x*4+3] = d[3];
2112                 }
2113                 break;
2114         }
2115 }
2116
2117 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2118 {
2119 #ifdef SSE2_PRESENT
2120         int x;
2121         int startx = span->startx;
2122         int endx = span->endx;
2123         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2124         unsigned char * RESTRICT pixelmask = span->pixelmask;
2125         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2126         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2127         if (!pixel)
2128                 return;
2129         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2130         pixeli += span->y * dpsoftrast.fb_width + span->x;
2131         // handle alphatest now (this affects depth writes too)
2132         if (thread->alphatest)
2133                 for (x = startx;x < endx;x++)
2134                         if (in4ub[x*4+3] < 0.5f)
2135                                 pixelmask[x] = false;
2136         // FIXME: this does not handle bigendian
2137         switch(thread->fb_blendmode)
2138         {
2139         case DPSOFTRAST_BLENDMODE_OPAQUE:
2140                 for (x = startx;x + 4 <= endx;)
2141                 {
2142                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2143                         {
2144                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2145                                 x += 4;
2146                         }
2147                         else
2148                         {
2149                                 if (pixelmask[x])
2150                                         pixeli[x] = ini[x];
2151                                 x++;
2152                         }
2153                 }
2154                 for (;x < endx;x++)
2155                         if (pixelmask[x])
2156                                 pixeli[x] = ini[x];
2157                 break;
2158         case DPSOFTRAST_BLENDMODE_ALPHA:
2159         #define FINISHBLEND(blend2, blend1) \
2160                 for (x = startx;x + 2 <= endx;x += 2) \
2161                 { \
2162                         __m128i src, dst; \
2163                         switch (*(const unsigned short*)&pixelmask[x]) \
2164                         { \
2165                         case 0x0101: \
2166                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2167                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2168                                 blend2; \
2169                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2170                                 continue; \
2171                         case 0x0100: \
2172                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2173                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2174                                 blend1; \
2175                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2176                                 continue; \
2177                         case 0x0001: \
2178                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2179                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2180                                 blend1; \
2181                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2182                                 continue; \
2183                         } \
2184                         break; \
2185                 } \
2186                 for(;x < endx; x++) \
2187                 { \
2188                         __m128i src, dst; \
2189                         if (!pixelmask[x]) \
2190                                 continue; \
2191                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2192                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2193                         blend1; \
2194                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2195                 }
2196
2197                 FINISHBLEND({
2198                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2199                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2200                 }, {
2201                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2202                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2203                 });
2204                 break;
2205         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2206                 FINISHBLEND({
2207                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2208                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2209                 }, {
2210                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2211                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2212                 });
2213                 break;
2214         case DPSOFTRAST_BLENDMODE_ADD:
2215                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2216                 break;
2217         case DPSOFTRAST_BLENDMODE_INVMOD:
2218                 FINISHBLEND({
2219                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2220                 }, {
2221                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2222                 });
2223                 break;
2224         case DPSOFTRAST_BLENDMODE_MUL:
2225                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2226                 break;
2227         case DPSOFTRAST_BLENDMODE_MUL2:
2228                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2229                 break;
2230         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2231                 FINISHBLEND({
2232                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2233                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2234                 }, {
2235                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2236                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2237                 });
2238                 break;
2239         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2240                 FINISHBLEND({
2241                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2242                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2243                 }, {
2244                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2245                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2246                 });
2247                 break;
2248         }
2249 #endif
2250 }
2251
2252 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2253 {
2254         int x;
2255         int startx = span->startx;
2256         int endx = span->endx;
2257         int flags;
2258         float c[4];
2259         float data[4];
2260         float slope[4];
2261         float tc[2], endtc[2];
2262         float tcscale[2];
2263         unsigned int tci[2];
2264         unsigned int tci1[2];
2265         unsigned int tcimin[2];
2266         unsigned int tcimax[2];
2267         int tciwrapmask[2];
2268         int tciwidth;
2269         int filter;
2270         int mip;
2271         const unsigned char * RESTRICT pixelbase;
2272         const unsigned char * RESTRICT pixel[4];
2273         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2274         // if no texture is bound, just fill it with white
2275         if (!texture)
2276         {
2277                 for (x = startx;x < endx;x++)
2278                 {
2279                         out4f[x*4+0] = 1.0f;
2280                         out4f[x*4+1] = 1.0f;
2281                         out4f[x*4+2] = 1.0f;
2282                         out4f[x*4+3] = 1.0f;
2283                 }
2284                 return;
2285         }
2286         mip = triangle->mip[texunitindex];
2287         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2288         // if this mipmap of the texture is 1 pixel, just fill it with that color
2289         if (texture->mipmap[mip][1] == 4)
2290         {
2291                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2292                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2293                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2294                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2295                 for (x = startx;x < endx;x++)
2296                 {
2297                         out4f[x*4+0] = c[0];
2298                         out4f[x*4+1] = c[1];
2299                         out4f[x*4+2] = c[2];
2300                         out4f[x*4+3] = c[3];
2301                 }
2302                 return;
2303         }
2304         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2305         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2306         flags = texture->flags;
2307         tcscale[0] = texture->mipmap[mip][2];
2308         tcscale[1] = texture->mipmap[mip][3];
2309         tciwidth = texture->mipmap[mip][2];
2310         tcimin[0] = 0;
2311         tcimin[1] = 0;
2312         tcimax[0] = texture->mipmap[mip][2]-1;
2313         tcimax[1] = texture->mipmap[mip][3]-1;
2314         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2315         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2316         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2317         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2318         for (x = startx;x < endx;)
2319         {
2320                 unsigned int subtc[2];
2321                 unsigned int substep[2];
2322                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2323                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2324                 if(nextsub >= endx)
2325                 {
2326                         nextsub = endsub = endx-1;      
2327                         if(x < nextsub) subscale = 65536.0f / (nextsub - x);
2328                 }
2329                 tc[0] = endtc[0];
2330                 tc[1] = endtc[1];
2331                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2332                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2333                 substep[0] = (endtc[0] - tc[0]) * subscale;
2334                 substep[1] = (endtc[1] - tc[1]) * subscale;
2335                 subtc[0] = tc[0] * (1<<16);
2336                 subtc[1] = tc[1] * (1<<16);
2337                 if(filter)
2338                 {
2339                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2340                         {
2341                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2342                                 {
2343                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2344                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2345                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2346                                         tci[0] = subtc[0]>>16;
2347                                         tci[1] = subtc[1]>>16;
2348                                         tci1[0] = tci[0] + 1;
2349                                         tci1[1] = tci[1] + 1;
2350                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2351                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2352                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2353                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2354                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2355                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2356                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2357                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2358                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2359                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2360                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2361                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2362                                         out4f[x*4+0] = c[0];
2363                                         out4f[x*4+1] = c[1];
2364                                         out4f[x*4+2] = c[2];
2365                                         out4f[x*4+3] = c[3];
2366                                 }
2367                         }
2368                         else
2369                         {
2370                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2371                                 {
2372                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2373                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2374                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2375                                         tci[0] = subtc[0]>>16;
2376                                         tci[1] = subtc[1]>>16;
2377                                         tci1[0] = tci[0] + 1;
2378                                         tci1[1] = tci[1] + 1;
2379                                         tci[0] &= tciwrapmask[0];
2380                                         tci[1] &= tciwrapmask[1];
2381                                         tci1[0] &= tciwrapmask[0];
2382                                         tci1[1] &= tciwrapmask[1];
2383                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2384                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2385                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2386                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2387                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2388                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2389                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2390                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2391                                         out4f[x*4+0] = c[0];
2392                                         out4f[x*4+1] = c[1];
2393                                         out4f[x*4+2] = c[2];
2394                                         out4f[x*4+3] = c[3];
2395                                 }
2396                         }
2397                 }
2398                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2399                 {
2400                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2401                         {
2402                                 tci[0] = subtc[0]>>16;
2403                                 tci[1] = subtc[1]>>16;
2404                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2405                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2406                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2407                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2408                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2409                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2410                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2411                                 out4f[x*4+0] = c[0];
2412                                 out4f[x*4+1] = c[1];
2413                                 out4f[x*4+2] = c[2];
2414                                 out4f[x*4+3] = c[3];
2415                         }
2416                 }
2417                 else
2418                 {
2419                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2420                         {
2421                                 tci[0] = subtc[0]>>16;
2422                                 tci[1] = subtc[1]>>16;
2423                                 tci[0] &= tciwrapmask[0];
2424                                 tci[1] &= tciwrapmask[1];
2425                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2426                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2427                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2428                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2429                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2430                                 out4f[x*4+0] = c[0];
2431                                 out4f[x*4+1] = c[1];
2432                                 out4f[x*4+2] = c[2];
2433                                 out4f[x*4+3] = c[3];
2434                         }
2435                 }
2436         }
2437 }
2438
2439 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2440 {
2441 #ifdef SSE2_PRESENT
2442         int x;
2443         int startx = span->startx;
2444         int endx = span->endx;
2445         int flags;
2446         __m128 data, slope, tcscale;
2447         __m128i tcsize, tcmask, tcoffset, tcmax;
2448         __m128 tc, endtc;
2449         __m128i subtc, substep, endsubtc;
2450         int filter;
2451         int mip;
2452         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2453         const unsigned char * RESTRICT pixelbase;
2454         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2455         // if no texture is bound, just fill it with white
2456         if (!texture)
2457         {
2458                 memset(out4ub + startx*4, 255, span->length*4);
2459                 return;
2460         }
2461         mip = triangle->mip[texunitindex];
2462         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2463         // if this mipmap of the texture is 1 pixel, just fill it with that color
2464         if (texture->mipmap[mip][1] == 4)
2465         {
2466                 unsigned int k = *((const unsigned int *)pixelbase);
2467                 for (x = startx;x < endx;x++)
2468                         outi[x] = k;
2469                 return;
2470         }
2471         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2472         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2473         flags = texture->flags;
2474         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2475         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2476         tcscale = _mm_cvtepi32_ps(tcsize);
2477         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2478         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2479         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2480         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2481         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2482         tcmax = _mm_packs_epi32(tcmask, tcmask);
2483         for (x = startx;x < endx;)
2484         {
2485                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2486                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2487                 if(nextsub >= endx)
2488                 {
2489                         nextsub = endsub = endx-1;
2490                         if(x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2491                 }       
2492                 tc = endtc;
2493                 subtc = endsubtc;
2494                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2495                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2496                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2497                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2498                 substep = _mm_slli_epi32(substep, 1);
2499                 if (filter)
2500                 {
2501                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2502                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2503                         {
2504                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2505                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2506                                 {
2507                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2508                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2509                                         tci = _mm_madd_epi16(tci, tcoffset);
2510                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2511                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2512                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2513                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2514                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2515                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2516                                         fracm = _mm_srli_epi16(subtc, 1);
2517                                         pix1 = _mm_add_epi16(pix1,
2518                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2519                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2520                                         pix3 = _mm_add_epi16(pix3,
2521                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2522                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2523                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2524                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2525                                         pix2 = _mm_add_epi16(pix2,
2526                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2527                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2528                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2529                                 }
2530                                 if (x <= endsub)
2531                                 {
2532                                         const unsigned char * RESTRICT ptr1;
2533                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2534                                         tci = _mm_madd_epi16(tci, tcoffset);
2535                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2536                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2537                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2538                                         fracm = _mm_srli_epi16(subtc, 1);
2539                                         pix1 = _mm_add_epi16(pix1,
2540                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2541                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2542                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2543                                         pix1 = _mm_add_epi16(pix1,
2544                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2545                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2546                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2547                                         x++;
2548                                 }
2549                         }
2550                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2551                         {
2552                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2553                                 {
2554                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2555                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2556                                         tci = _mm_madd_epi16(tci, tcoffset);
2557                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2558                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2559                                                                                         _mm_setzero_si128());
2560                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2561                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2562                                                                                         _mm_setzero_si128());
2563                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2564                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2565                                         tci = _mm_madd_epi16(tci, tcoffset);
2566                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2567                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2568                                                                                         _mm_setzero_si128());
2569                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2570                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2571                                                                                         _mm_setzero_si128());
2572                                         fracm = _mm_srli_epi16(subtc, 1);
2573                                         pix1 = _mm_add_epi16(pix1,
2574                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2575                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2576                                         pix3 = _mm_add_epi16(pix3,
2577                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2578                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2579                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2580                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2581                                         pix2 = _mm_add_epi16(pix2,
2582                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2583                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2584                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2585                                 }
2586                                 if (x <= endsub)
2587                                 {
2588                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2589                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2590                                         tci = _mm_madd_epi16(tci, tcoffset);
2591                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2592                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2593                                                                                         _mm_setzero_si128());
2594                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2595                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2596                                                                                         _mm_setzero_si128());
2597                                         fracm = _mm_srli_epi16(subtc, 1);
2598                                         pix1 = _mm_add_epi16(pix1,
2599                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2600                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2601                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2602                                         pix1 = _mm_add_epi16(pix1,
2603                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2604                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2605                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2606                                         x++;
2607                                 }
2608                         }
2609                         else
2610                         {
2611                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2612                                 {
2613                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2614                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2615                                         tci = _mm_madd_epi16(tci, tcoffset);
2616                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2617                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2618                                                                                         _mm_setzero_si128());
2619                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2620                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2621                                                                                         _mm_setzero_si128());
2622                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2623                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2624                                         tci = _mm_madd_epi16(tci, tcoffset);
2625                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2626                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2627                                                                                         _mm_setzero_si128());
2628                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2629                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2630                                                                                         _mm_setzero_si128());
2631                                         fracm = _mm_srli_epi16(subtc, 1);
2632                                         pix1 = _mm_add_epi16(pix1,
2633                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2634                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2635                                         pix3 = _mm_add_epi16(pix3,
2636                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2637                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2638                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2639                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2640                                         pix2 = _mm_add_epi16(pix2,
2641                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2642                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2643                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2644                                 }
2645                                 if (x <= endsub)
2646                                 {
2647                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2648                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2649                                         tci = _mm_madd_epi16(tci, tcoffset);
2650                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2651                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2652                                                                                         _mm_setzero_si128());
2653                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2654                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2655                                                                                         _mm_setzero_si128());
2656                                         fracm = _mm_srli_epi16(subtc, 1);
2657                                         pix1 = _mm_add_epi16(pix1,
2658                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2659                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2660                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2661                                         pix1 = _mm_add_epi16(pix1,
2662                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2663                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2664                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2665                                         x++;
2666                                 }
2667                         }
2668                 }
2669                 else
2670                 {
2671                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2672                         {
2673                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2674                                 {
2675                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2676                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2677                                         tci = _mm_madd_epi16(tci, tcoffset);
2678                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2679                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2680                                 }
2681                                 if (x <= endsub)
2682                                 {
2683                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2684                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2685                                         tci = _mm_madd_epi16(tci, tcoffset);
2686                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2687                                         x++;
2688                                 }
2689                         }
2690                         else
2691                         {
2692                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2693                                 {
2694                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2695                                         tci = _mm_and_si128(tci, tcmax); 
2696                                         tci = _mm_madd_epi16(tci, tcoffset);
2697                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2698                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2699                                 }
2700                                 if (x <= endsub)
2701                                 {
2702                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2703                                         tci = _mm_and_si128(tci, tcmax); 
2704                                         tci = _mm_madd_epi16(tci, tcoffset);
2705                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2706                                         x++;
2707                                 }
2708                         }
2709                 }
2710         }
2711 #endif
2712 }
2713
2714 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2715 {
2716         // TODO: IMPLEMENT
2717         memset(out4ub, 255, span->length*4);
2718 }
2719
// Placeholder for shadow map sampling: the lookup vector is ignored and the
// result is always 1.0f (presumably "not shadowed" - confirm once implemented).
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        (void)vector;
        return 1.0f;
}
2725
2726 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2727 {
2728         int x;
2729         int startx = span->startx;
2730         int endx = span->endx;
2731         float c[4];
2732         float data[4];
2733         float slope[4];
2734         float z;
2735         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2736         for (x = startx;x < endx;x++)
2737         {
2738                 z = zf[x];
2739                 c[0] = (data[0] + slope[0]*x) * z;
2740                 c[1] = (data[1] + slope[1]*x) * z;
2741                 c[2] = (data[2] + slope[2]*x) * z;
2742                 c[3] = (data[3] + slope[3]*x) * z;
2743                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2744                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2745                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2746                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2747         }
2748 }
2749
2750 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2751 {
2752         int x;
2753         int startx = span->startx;
2754         int endx = span->endx;
2755         float c[4];
2756         float data[4];
2757         float slope[4];
2758         float z;
2759         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2760         for (x = startx;x < endx;x++)
2761         {
2762                 z = zf[x];
2763                 c[0] = (data[0] + slope[0]*x) * z;
2764                 c[1] = (data[1] + slope[1]*x) * z;
2765                 c[2] = (data[2] + slope[2]*x) * z;
2766                 c[3] = (data[3] + slope[3]*x) * z;
2767                 out4f[x*4+0] = c[0];
2768                 out4f[x*4+1] = c[1];
2769                 out4f[x*4+2] = c[2];
2770                 out4f[x*4+3] = c[3];
2771         }
2772 }
2773
2774 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2775 {
2776         int x, startx = span->startx, endx = span->endx;
2777         float c[4], localcolor[4];
2778         localcolor[0] = subcolor[0];
2779         localcolor[1] = subcolor[1];
2780         localcolor[2] = subcolor[2];
2781         localcolor[3] = subcolor[3];
2782         for (x = startx;x < endx;x++)
2783         {
2784                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2785                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2786                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2787                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2788                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2789                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2790                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2791                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2792         }
2793 }
2794
2795 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2796 {
2797         int x, startx = span->startx, endx = span->endx;
2798         for (x = startx;x < endx;x++)
2799         {
2800                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2801                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2802                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2803                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2804         }
2805 }
2806
2807 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2808 {
2809         int x, startx = span->startx, endx = span->endx;
2810         for (x = startx;x < endx;x++)
2811         {
2812                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2813                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2814                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2815                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2816         }
2817 }
2818
2819 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2820 {
2821         int x, startx = span->startx, endx = span->endx;
2822         float a, b;
2823         for (x = startx;x < endx;x++)
2824         {
2825                 a = 1.0f - inb4f[x*4+3];
2826                 b = inb4f[x*4+3];
2827                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2828                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2829                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2830                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2831         }
2832 }
2833
2834 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2835 {
2836         int x, startx = span->startx, endx = span->endx;
2837         float localcolor[4], ilerp, lerp;
2838         localcolor[0] = color[0];
2839         localcolor[1] = color[1];
2840         localcolor[2] = color[2];
2841         localcolor[3] = color[3];
2842         ilerp = 1.0f - localcolor[3];
2843         lerp = localcolor[3];
2844         for (x = startx;x < endx;x++)
2845         {
2846                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2847                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2848                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2849                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2850         }
2851 }
2852
2853
2854
2855 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2856 {
2857 #ifdef SSE2_PRESENT
2858         int x;
2859         int startx = span->startx;
2860         int endx = span->endx;
2861         __m128 data, slope;
2862         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2863         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2864         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2865         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2866         data = _mm_mul_ps(data, _mm_set1_ps(256.0f));
2867         slope = _mm_mul_ps(slope, _mm_set1_ps(256.0f));
2868         for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
2869         {
2870                 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2871                 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), mod2;
2872                 data = _mm_add_ps(data, slope);
2873                 mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
2874                 mod = _mm_unpacklo_epi64(_mm_packs_epi32(mod, mod), _mm_packs_epi32(mod2, mod2));
2875                 pix = _mm_mulhi_epu16(pix, mod);
2876                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2877         }
2878         for (;x < endx;x++, data = _mm_add_ps(data, slope))
2879         {
2880                 __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2881                 __m128i mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2882                 mod = _mm_packs_epi32(mod, mod);
2883                 pix = _mm_mulhi_epu16(pix, mod);
2884                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2885         }
2886 #endif
2887 }
2888
2889 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2890 {
2891 #ifdef SSE2_PRESENT
2892         int x;
2893         int startx = span->startx;
2894         int endx = span->endx;
2895         __m128 data, slope;
2896         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2897         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2898         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2899         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
2900         data = _mm_mul_ps(data, _mm_set1_ps(255.0f));
2901         slope = _mm_mul_ps(slope, _mm_set1_ps(255.0f));
2902         for (x = startx;x+2 <= endx;x += 2, data = _mm_add_ps(data, slope))
2903         {
2904                 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x]))), pix2;
2905                 data = _mm_add_ps(data, slope);
2906                 pix2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x+1])));
2907                 pix = _mm_unpacklo_epi64(_mm_packs_epi32(pix, pix), _mm_packs_epi32(pix2, pix2));
2908                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2909         }
2910         for (;x < endx;x++, data = _mm_add_ps(data, slope))
2911         {
2912                 __m128i pix = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&zf[x])));
2913                 pix = _mm_packs_epi32(pix, pix);
2914                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2915         }
2916 #endif
2917 }
2918
2919 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2920 {
2921 #ifdef SSE2_PRESENT
2922         int x, startx = span->startx, endx = span->endx;
2923         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
2924         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
2925         for (x = startx;x+2 <= endx;x+=2)
2926         {
2927                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2928                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2929                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2930                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2931         }
2932         if(x < endx)
2933         {
2934                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2935                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2936                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
2937                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2938         }
2939 #endif
2940 }
2941
2942 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2943 {
2944 #ifdef SSE2_PRESENT
2945         int x, startx = span->startx, endx = span->endx;
2946         for (x = startx;x+2 <= endx;x+=2)
2947         {
2948                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2949                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2950                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2951                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2952         }
2953         if(x < endx)
2954         {
2955                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2956                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
2957                 pix1 = _mm_mulhi_epu16(pix1, pix2);
2958                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2959         }
2960 #endif
2961 }
2962
2963 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
2964 {
2965 #ifdef SSE2_PRESENT
2966         int x, startx = span->startx, endx = span->endx;
2967         for (x = startx;x+2 <= endx;x+=2)
2968         {
2969                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2970                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2971                 pix1 = _mm_add_epi16(pix1, pix2);
2972                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2973         }
2974         if(x < endx)
2975         {
2976                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2977                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2978                 pix1 = _mm_add_epi16(pix1, pix2);
2979                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2980         }
2981 #endif
2982 }
2983
2984 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
2985 {
2986 #ifdef SSE2_PRESENT
2987         int x, startx = span->startx, endx = span->endx;
2988         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
2989         tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
2990         for (x = startx;x+2 <= endx;x+=2)
2991         {
2992                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2993                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
2994                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
2995                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2996         }
2997         if(x < endx)
2998         {
2999                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3000                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3001                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3002                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3003         }
3004 #endif
3005 }
3006
3007 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3008 {
3009 #ifdef SSE2_PRESENT
3010         int x, startx = span->startx, endx = span->endx;
3011         for (x = startx;x+2 <= endx;x+=2)
3012         {
3013                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3014                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3015                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3016                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3017                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3018         }
3019         if(x < endx)
3020         {
3021                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3022                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3023                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3024                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3025                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3026         }
3027 #endif
3028 }
3029
3030 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3031 {
3032 #ifdef SSE2_PRESENT
3033         int x, startx = span->startx, endx = span->endx;
3034         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3035         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
3036         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3037         for (x = startx;x+2 <= endx;x+=2)
3038         {
3039                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3040                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3041                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3042         }
3043         if(x < endx)
3044         {
3045                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3046                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3047                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3048         }
3049 #endif
3050 }
3051
3052
3053
3054 void DPSOFTRAST_VertexShader_Generic(void)
3055 {
3056         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3057         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3058         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3059         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3060                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3061 }
3062
3063 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3064 {
3065         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3066         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3067         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3068         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3069         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3070         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3071         {
3072                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3073                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3074                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3075                 {
3076                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3077                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3078                         {
3079                                 // multiply
3080                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3081                         }
3082                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3083                         {
3084                                 // add
3085                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3086                         }
3087                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3088                         {
3089                                 // alphablend
3090                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3091                         }
3092                 }
3093         }
3094         else
3095                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3096         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3097 }
3098
3099
3100
3101 void DPSOFTRAST_VertexShader_PostProcess(void)
3102 {
3103         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3104         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3105         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3106 }
3107
3108 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3109 {
3110         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3111         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3112         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3113         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3114         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3115         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3116         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3117         {
3118                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3119                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3120         }
3121         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3122         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3123         {
3124                 // TODO: implement saturation
3125         }
3126         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3127         {
3128                 // TODO: implement gammaramps
3129         }
3130         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3131 }
3132
3133
3134
3135 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3136 {
3137         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3138 }
3139
3140 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3141 {
3142         // this is never called (because colormask is off when this shader is used)
3143         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3144         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3145         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3146         memset(buffer_FragColorbgra8, 0, span->length*4);
3147         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3148 }
3149
3150
3151
3152 void DPSOFTRAST_VertexShader_FlatColor(void)
3153 {
3154         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3155         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3156 }
3157
3158 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3159 {
3160         int x, startx = span->startx, endx = span->endx;
3161         int Color_Ambienti[4];
3162         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3163         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3164         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3165         Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3166         Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3167         Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3168         Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]        *256.0f);
3169         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3170         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3171         for (x = startx;x < endx;x++)
3172         {
3173                 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3174                 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3175                 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3176                 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3177         }
3178         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3179 }
3180
3181
3182
3183 void DPSOFTRAST_VertexShader_VertexColor(void)
3184 {
3185         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3186         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3187         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3188 }
3189
// Pixel stage of the vertex-color shader for one span (SSE2 only; the body is
// empty when SSE2_PRESENT is not defined).
// Per pixel it computes, in fixed point,
//   out = texture * (Color_Ambient + Color_Diffuse * attribute)
// where texture is sampled from GL20TU_COLOR and attribute is the
// DPSOFTRAST_ARRAY_COLOR attribute stepped linearly along the span and
// multiplied by buffer_z[x] (presumably the perspective correction factor
// produced by DPSOFTRAST_Draw_Span_Begin - confirm).
// The output alpha comes from the Alpha uniform only: the fourth lane of the
// diffuse term is masked to zero and the fourth lane of the ambient term is
// replaced by Alpha*255.
// Results are written straight into the first color buffer unless alpha test
// or a non-opaque blend mode is active, in which case they go to a local
// buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3190 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3191 {
3192 #ifdef SSE2_PRESENT
3193         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // start of this span's row in the framebuffer; indexed with the same x as the local buffers
3194         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3195         int x, startx = span->startx, endx = span->endx;
3196         __m128i Color_Ambientm, Color_Diffusem;
3197         __m128 data, slope;
3198         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3199         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3200         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3201         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3202         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3203         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
             // alpha test and blending need the Finish pass, so render into the local buffer instead
3204         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3205                 pixel = buffer_FragColorbgra8;
             // ambient color as 8.8 fixed point with lanes 0 and 2 swapped to match the BGRA byte order
3206         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
             // replace the fourth lane by the Alpha uniform scaled to 0..255
3207         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3208         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3209         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse color and the attribute are both scaled by 4096 so that their
             // high-half 16 bit product (>>16) ends up scaled by 256 like the ambient term
3210         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
             // the diffuse term does not contribute to alpha
3211         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3212         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
             // fetch the attribute's value and per-pixel step for this span, swizzle both to BGRA
             // lane order and advance the value to the first pixel of the span
3213         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3214         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3215         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3216         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3217         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3218         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
             // data is stepped once per pixel by the loop increment, including for skipped pixels
3219         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3220         {
3221                 __m128i color, mod, pix;
                     // fast path: four consecutive pixels whose mask bytes are all 1
3222                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3223                 {
3224                         __m128i pix2, mod2;
3225                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3226                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                             // data is stepped three times here; the loop increment supplies the fourth step
3227                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3228                         data = _mm_add_ps(data, slope);
3229                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3230                         data = _mm_add_ps(data, slope);
3231                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3232                         data = _mm_add_ps(data, slope);
3233                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
                             // the texel bytes are placed in the high byte of each 16 bit lane (texel*256)
                             // so the second high-half multiply removes the 8.8 scale again
3234                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3235                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3236                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3237                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3238                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                             // together with the loop's x++ this advances by four pixels
3239                         x += 3;
3240                         continue;
3241                 }
                     // slow path: one pixel at a time, skipping pixels whose mask byte is zero
3242                 if(!pixelmask[x])
3243                         continue;
3244                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3245                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3246                 mod = _mm_packs_epi32(mod, mod);
3247                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3248                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3249         }
             // only needed when the span was rendered into the local buffer
3250         if(pixel == buffer_FragColorbgra8)
3251                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3252 #endif
3253 }
3254
3255
3256
3257 void DPSOFTRAST_VertexShader_Lightmap(void)
3258 {
3259         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3260         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3261         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3262 }
3263
// Pixel stage of the lightmap shader for one span (SSE2 only; the body is
// empty when SSE2_PRESENT is not defined).
// Per pixel it computes, in 8.8 fixed point,
//   out = color * (Color_Ambient + Color_Diffuse * lightmap)
// and, when SHADERPERMUTATION_GLOW is set, additionally adds
//   Color_Glow * glow
// where color is sampled from GL20TU_COLOR (TEXCOORD0), lightmap from
// GL20TU_LIGHTMAP (TEXCOORD4) and glow from GL20TU_GLOW (TEXCOORD0).
// The output alpha comes from the color texture's alpha and the Alpha
// uniform only: the fourth lanes of the diffuse and glow colors are masked to
// zero and the fourth lane of the ambient color is replaced by Alpha*255.
// Results are written straight into the first color buffer unless alpha test
// or a non-opaque blend mode is active, in which case they go to a local
// buffer that is passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3264 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3265 {
3266 #ifdef SSE2_PRESENT
3267         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // start of this span's row in the framebuffer; indexed with the same x as the local buffers
3268         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3269         int x, startx = span->startx, endx = span->endx;
3270         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3271         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3272         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3273         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3274         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3275         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3276         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3277         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3278         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
             // alpha test and blending need the Finish pass, so render into the local buffer instead
3279         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3280                 pixel = buffer_FragColorbgra8;
             // ambient color as 8.8 fixed point with lanes 0 and 2 swapped to match the BGRA byte order;
             // its fourth lane is replaced by the Alpha uniform scaled to 0..255
3281         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3282         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3283         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3284         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse color as 8.8 fixed point, fourth lane cleared so it does not contribute to alpha
3285         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3286         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3287         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3288         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3289         {
3290                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
                     // glow color as 8.8 fixed point, fourth lane cleared
3291                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3292                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3293                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
                     // ambient in the low four lanes, glow color in the high four lanes (used by the single-pixel path)
3294                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3295                 for (x = startx;x < endx;x++)
3296                 {
3297                         __m128i color, lightmap, glow, pix;
                             // fast path: four consecutive pixels whose mask bytes are all 1
3298                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3299                         {
3300                                 __m128i pix2;
3301                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3302                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3303                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
                                     // texel bytes are unpacked into the high byte of each 16 bit lane (texel*256)
                                     // so every high-half multiply by an 8.8 value yields an 8 bit scaled result
3304                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3305                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3306                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3307                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3308                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3309                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3310                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                     // together with the loop's x++ this advances by four pixels
3311                                 x += 3;
3312                                 continue;
3313                         }
                             // slow path: one pixel at a time, skipping pixels whose mask byte is zero
3314                         if(!pixelmask[x])
3315                                 continue;
3316                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3317                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3318                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
                             // low half: (diffuse*lightmap + ambient) * color; high half: (0 + glowcolor) * glow,
                             // because lightmap's high lanes are zero; the two halves are summed on the next line
3319                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3320                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3321                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3322                 }
3323         }
3324         else
3325         {
                     // same as above without the glow term
3326                 for (x = startx;x < endx;x++)
3327                 {
3328                         __m128i color, lightmap, pix;
                             // fast path: four consecutive pixels whose mask bytes are all 1
3329                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3330                         {
3331                                 __m128i pix2;
3332                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3333                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3334                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3335                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3336                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3337                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3338                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                     // together with the loop's x++ this advances by four pixels
3339                                 x += 3;
3340                                 continue;
3341                         }
                             // slow path: one pixel at a time, skipping pixels whose mask byte is zero
3342                         if(!pixelmask[x]) 
3343                                 continue;
3344                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3345                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3346                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3347                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3348                 }
3349         }
             // only needed when the span was rendered into the local buffer
3350         if(pixel == buffer_FragColorbgra8)
3351                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3352 #endif
3353 }
3354
3355
3356
3357 void DPSOFTRAST_VertexShader_FakeLight(void)
3358 {
3359         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3360 }
3361
3362 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3363 {
3364         // TODO: IMPLEMENT
3365         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3366         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3367         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3368         memset(buffer_FragColorbgra8, 0, span->length*4);
3369         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3370 }
3371
3372
3373
// Vertex stage of the model-space light-direction-map shader.
// Currently identical to the lightmap vertex shader, to which it delegates.
3374 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3375 {
3376         DPSOFTRAST_VertexShader_Lightmap();
3377 }
3378
// Pixel stage of the model-space light-direction-map shader.
// Placeholder: it renders the span with the plain lightmap pixel shader; no
// light-direction-specific shading is implemented yet.
3379 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3380 {
3381         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3382         // TODO: IMPLEMENT
3383 }
3384
3385
3386
// Vertex stage of the tangent-space light-direction-map shader.
// Currently identical to the lightmap vertex shader, to which it delegates.
3387 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3388 {
3389         DPSOFTRAST_VertexShader_Lightmap();
3390 }
3391
// Pixel stage of the tangent-space light-direction-map shader.
// Placeholder: it renders the span with the plain lightmap pixel shader; no
// light-direction-specific shading is implemented yet.
3392 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3393 {
3394         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3395         // TODO: IMPLEMENT
3396 }
3397
3398
3399
3400 void DPSOFTRAST_VertexShader_LightDirection(void)
3401 {
3402         int i;
3403         int numvertices = dpsoftrast.numvertices;
3404         float LightDir[4];
3405         float LightVector[4];
3406         float EyePosition[4];
3407         float EyeVectorModelSpace[4];
3408         float EyeVector[4];
3409         float position[4];
3410         float svector[4];
3411         float tvector[4];
3412         float normal[4];
3413         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3414         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3415         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3416         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3417         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3418         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3419         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3420         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3421         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3422         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3423         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3424         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3425         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3426         for (i = 0;i < numvertices;i++)
3427         {
3428                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3429                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3430                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3431                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3432                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3433                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3434                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3435                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3436                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3437                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3438                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3439                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3440                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3441                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3442                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3443                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3444                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3445                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3446                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3447                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3448                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3449                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3450                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3451                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3452                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3453                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3454                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3455                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3456                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3457         }
3458         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3459 }
3460
3461 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3462 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3463 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3464 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3465 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3466 #define DPSOFTRAST_Vector3Normalize(v)\
3467 do\
3468 {\
3469         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3470         if (len)\
3471         {\
3472                 len = 1.0f / len;\
3473                 v[0] *= len;\
3474                 v[1] *= len;\
3475                 v[2] *= len;\
3476         }\
3477 }\
3478 while(0)
3479
// Pixel stage of the LightDirection shader mode: shades the pixels startx..endx-1
// of one span into buffer_FragColorbgra8 and hands that buffer to
// DPSOFTRAST_Draw_Span_FinishBGRA8.
//
// thread->shader_permutation selects one of three paths:
//  - SHADERPERMUTATION_SPECULAR: ambient + diffuse + specular terms
//  - SHADERPERMUTATION_DIFFUSE (without SPECULAR): ambient + diffuse terms
//  - neither: ambient term only
// In every path SHADERPERMUTATION_GLOW adds the glow texture scaled by Color_Glow.
//
// The color uniforms are loaded with components 0 and 2 swapped so that index c of
// a color lines up with byte c of the ...bgra8 texture buffers (presumably the
// uniforms themselves are in RGB order - confirm against the code that sets them).
//
// NOTE(review): the pants/shirt textures are sampled whenever
// SHADERPERMUTATION_COLORMAPPING is set, but only the SPECULAR path adds them to
// the diffuse texel; the other two paths never read those buffers.
3480 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3481 {
3482         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3483         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3484         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3485         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3486         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3487         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3488         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3489         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3490         int x, startx = span->startx, endx = span->endx;
3491         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3492         float LightVectordata[4];
3493         float LightVectorslope[4];
3494         float EyeVectordata[4];
3495         float EyeVectorslope[4];
3496         float z;
3497         float diffusetex[4];
3498         float glosstex[4];
3499         float surfacenormal[4];
3500         float lightnormal[4];
3501         float eyenormal[4];
3502         float specularnormal[4];
3503         float diffuse;
3504         float specular;
3505         float SpecularPower;
3506         int d[4];
             // colors needed by every path; note the 0 <-> 2 component swap on load
3507         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3508         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3509         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3510         Color_Glow[3] = 0.0f;
3511         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3512         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3513         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
             // the 4th ambient component carries the Alpha uniform; it is the only factor applied to the output alpha
3514         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3515         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3516         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3517         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3518         Color_Pants[3] = 0.0f;
3519         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3520         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3521         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3522         Color_Shirt[3] = 0.0f;
             // buffer_z is handed to Draw_Span_Begin here and read back per pixel as z in the loops below
3523         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // texture fetches shared by all paths (all use the TEXCOORD0 varying)
3524         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3525         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3526         {
3527                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3528                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3529         }
3530         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3531         {
3532                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3533         }
3534         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3535         {
                     // path 1: ambient + diffuse + specular
3536                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3537                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3538                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3539                 Color_Diffuse[3] = 0.0f;
3540                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3541                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3542                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3543                 LightColor[3] = 0.0f;
                     // light vector varying comes from TEXCOORD1; it is evaluated per pixel as (data + slope*x) * z
3544                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3545                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3546                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3547                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3548                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3549                 Color_Specular[3] = 0.0f;
                     // pre-divided by 255 because it is later multiplied by the raw gloss alpha byte (glosstex[3])
3550                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
                     // eye vector varying comes from TEXCOORD2
3551                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3552                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3553                 for (x = startx;x < endx;x++)
3554                 {
3555                         z = buffer_z[x];
3556                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3557                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3558                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3559                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3560                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3561                         {
                                     // add tinted pants/shirt layers; the [3] line adds nothing because Color_Pants[3] and Color_Shirt[3] are 0
3562                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3563                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3564                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3565                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3566                         }
3567                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3568                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3569                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3570                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
                             // unpack the normalmap texel: byte/128 - 1 per component, with byte 2 -> [0] and byte 0 -> [2]
3571                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3572                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3573                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3574                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3575
3576                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3577                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3578                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3579                         DPSOFTRAST_Vector3Normalize(lightnormal);
3580
3581                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3582                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3583                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3584                         DPSOFTRAST_Vector3Normalize(eyenormal);
3585
                             // normalized sum of the unit light and eye vectors (the half-angle vector)
3586                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3587                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3588                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3589                         DPSOFTRAST_Vector3Normalize(specularnormal);
3590
                             // both dot products are clamped at 0 before use
3591                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3592                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3593                         specular = pow(specular, SpecularPower * glosstex[3]);
                             // each channel is truncated to int and clamped to 255 (there is no lower clamp); alpha is texel alpha * Color_Ambient[3] only
3594                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3595                         {
3596                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3597                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3598                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3599                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3600                         }
3601                         else
3602                         {
3603                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3604                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3605                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3606                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3607                         }
3608                         buffer_FragColorbgra8[x*4+0] = d[0];
3609                         buffer_FragColorbgra8[x*4+1] = d[1];
3610                         buffer_FragColorbgra8[x*4+2] = d[2];
3611                         buffer_FragColorbgra8[x*4+3] = d[3];
3612                 }
3613         }
3614         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3615         {
                     // path 2: ambient + diffuse (no gloss texture, no eye vector, no colormapping)
3616                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3617                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3618                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3619                 Color_Diffuse[3] = 0.0f;
3620                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3621                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3622                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3623                 LightColor[3] = 0.0f;
3624                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3625                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3626                 for (x = startx;x < endx;x++)
3627                 {
3628                         z = buffer_z[x];
3629                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3630                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3631                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3632                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
                             // same normalmap unpacking as in the specular path
3633                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3634                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3635                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3636                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3637
3638                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3639                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3640                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3641                         DPSOFTRAST_Vector3Normalize(lightnormal);
3642
3643                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3644                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3645                         {
3646                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3647                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3648                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3649                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3650                         }
3651                         else
3652                         {
3653                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3654                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3655                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3656                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3657                         }
3658                         buffer_FragColorbgra8[x*4+0] = d[0];
3659                         buffer_FragColorbgra8[x*4+1] = d[1];
3660                         buffer_FragColorbgra8[x*4+2] = d[2];
3661                         buffer_FragColorbgra8[x*4+3] = d[3];
3662                 }
3663         }
3664         else
3665         {
                     // path 3: ambient only (color texel * Color_Ambient, plus optional glow)
3666                 for (x = startx;x < endx;x++)
3667                 {
                             // z is loaded here but not used by this path
3668                         z = buffer_z[x];
3669                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3670                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3671                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3672                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3673
3674                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3675                         {
3676                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3677                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3678                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3679                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3680                         }
3681                         else
3682                         {
3683                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3684                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3685                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3686                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3687                         }
3688                         buffer_FragColorbgra8[x*4+0] = d[0];
3689                         buffer_FragColorbgra8[x*4+1] = d[1];
3690                         buffer_FragColorbgra8[x*4+2] = d[2];
3691                         buffer_FragColorbgra8[x*4+3] = d[3];
3692                 }
3693         }
             // hand the shaded span to the common finish step
3694         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3695 }
3696
3697
3698
3699 void DPSOFTRAST_VertexShader_LightSource(void)
3700 {
3701         int i;
3702         int numvertices = dpsoftrast.numvertices;
3703         float LightPosition[4];
3704         float LightVector[4];
3705         float LightVectorModelSpace[4];
3706         float EyePosition[4];
3707         float EyeVectorModelSpace[4];
3708         float EyeVector[4];
3709         float position[4];
3710         float svector[4];
3711         float tvector[4];
3712         float normal[4];
3713         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3714         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3715         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3716         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3717         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3718         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3719         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3720         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3721         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3722         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3723         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3724         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3725         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3726         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3727         for (i = 0;i < numvertices;i++)
3728         {
3729                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3730                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3731                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3732                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3733                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3734                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3735                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3736                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3737                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3738                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3739                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3740                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3741                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3742                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3743                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3744                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3745                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3746                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
3747                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3748                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3749                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3750                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3751                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3752                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3753                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3754                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3755                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3756                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3757                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3758                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3759                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3760                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3761         }
3762         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3763         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3764 }
3765
3766 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3767 {
3768 #ifdef SSE2_PRESENT
3769         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3770         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3771         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3772         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3773         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3774         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3775         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3776         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3777         int x, startx = span->startx, endx = span->endx;
3778         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3779         float CubeVectordata[4];
3780         float CubeVectorslope[4];
3781         float LightVectordata[4];
3782         float LightVectorslope[4];
3783         float EyeVectordata[4];
3784         float EyeVectorslope[4];
3785         float z;
3786         float diffusetex[4];
3787         float glosstex[4];
3788         float surfacenormal[4];
3789         float lightnormal[4];
3790         float eyenormal[4];
3791         float specularnormal[4];
3792         float diffuse;
3793         float specular;
3794         float SpecularPower;
3795         float CubeVector[4];
3796         float attenuation;
3797         int d[4];
3798         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3799         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3800         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3801         Color_Glow[3] = 0.0f;
3802         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3803         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3804         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3805         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3806         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3807         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3808         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3809         Color_Diffuse[3] = 0.0f;
3810         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3811         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3812         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3813         Color_Specular[3] = 0.0f;
3814         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3815         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3816         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3817         Color_Pants[3] = 0.0f;
3818         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3819         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3820         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3821         Color_Shirt[3] = 0.0f;
3822         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3823         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3824         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3825         LightColor[3] = 0.0f;
3826         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3827         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3828         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3829         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3830         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3831         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
3832         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3833         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3834         {
3835                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3836                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3837         }
3838         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3839                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
3840         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3841         {
3842                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3843                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3844                 for (x = startx;x < endx;x++)
3845                 {
3846                         z = buffer_z[x];
3847                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3848                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3849                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3850                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3851                         if (attenuation < 0.01f)
3852                                 continue;
3853                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3854                         {
3855                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3856                                 if (attenuation < 0.01f)
3857                                         continue;
3858                         }
3859
3860                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3861                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3862                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3863                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3864                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3865                         {
3866                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3867                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3868                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3869                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3870                         }
3871                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3872                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3873                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3874                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3875                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3876                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3877                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3878                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3879
3880                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3881                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3882                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3883                         DPSOFTRAST_Vector3Normalize(lightnormal);
3884
3885                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3886                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3887                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3888                         DPSOFTRAST_Vector3Normalize(eyenormal);
3889
3890                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3891                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3892                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3893                         DPSOFTRAST_Vector3Normalize(specularnormal);
3894
3895                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3896                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3897                         specular = pow(specular, SpecularPower * glosstex[3]);
3898                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3899                         {
3900                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3901                                 attenuation *= (1.0f / 255.0f);
3902                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3903                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3904                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3905                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3906                         }
3907                         else
3908                         {
3909                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3910                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3911                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3912                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3913                         }
3914                         buffer_FragColorbgra8[x*4+0] = d[0];
3915                         buffer_FragColorbgra8[x*4+1] = d[1];
3916                         buffer_FragColorbgra8[x*4+2] = d[2];
3917                         buffer_FragColorbgra8[x*4+3] = d[3];
3918                 }
3919         }
3920         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3921         {
3922                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3923                 for (x = startx;x < endx;x++)
3924                 {
3925                         z = buffer_z[x];
3926                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3927                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3928                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3929                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3930                         if (attenuation < 0.01f)
3931                                 continue;
3932                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3933                         {
3934                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3935                                 if (attenuation < 0.01f)
3936                                         continue;
3937                         }
3938
3939                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3940                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3941                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3942                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3943                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3944                         {
3945                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3946                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3947                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3948                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3949                         }
3950                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3951                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3952                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3953                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3954
3955                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3956                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3957                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3958                         DPSOFTRAST_Vector3Normalize(lightnormal);
3959
3960                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3961                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3962                         {
3963                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3964                                 attenuation *= (1.0f / 255.0f);
3965                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3966                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3967                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3968                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
3969                         }
3970                         else
3971                         {
3972                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3973                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3974                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3975                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3976                         }
3977                         buffer_FragColorbgra8[x*4+0] = d[0];
3978                         buffer_FragColorbgra8[x*4+1] = d[1];
3979                         buffer_FragColorbgra8[x*4+2] = d[2];
3980                         buffer_FragColorbgra8[x*4+3] = d[3];
3981                 }
3982         }
3983         else
3984         {
3985                 for (x = startx;x < endx;x++)
3986                 {
3987                         z = buffer_z[x];
3988                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3989                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3990                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3991                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3992                         if (attenuation < 0.01f)
3993                                 continue;
3994                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3995                         {
3996                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3997                                 if (attenuation < 0.01f)
3998                                         continue;
3999                         }
4000
4001                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4002                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4003                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4004                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4005                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4006                         {
4007                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4008                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4009                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4010                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4011                         }
4012                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4013                         {
4014                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4015                                 attenuation *= (1.0f / 255.0f);
4016                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4017                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4018                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4019                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4020                         }
4021                         else
4022                         {
4023                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4024                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4025                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4026                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4027                         }
4028                         buffer_FragColorbgra8[x*4+0] = d[0];
4029                         buffer_FragColorbgra8[x*4+1] = d[1];
4030                         buffer_FragColorbgra8[x*4+2] = d[2];
4031                         buffer_FragColorbgra8[x*4+3] = d[3];
4032                 }
4033         }
4034         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4035 #endif
4036 }
4037
4038
4039
4040 void DPSOFTRAST_VertexShader_Refraction(void)
4041 {
4042         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4043 }
4044
4045 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4046 {
4047         // TODO: IMPLEMENT
4048         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4049         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4050         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4051         memset(buffer_FragColorbgra8, 0, span->length*4);
4052         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4053 }
4054
4055
4056
4057 void DPSOFTRAST_VertexShader_Water(void)
4058 {
4059         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4060 }
4061
4062
4063 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4064 {
4065         // TODO: IMPLEMENT
4066         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4067         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4068         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4069         memset(buffer_FragColorbgra8, 0, span->length*4);
4070         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4071 }
4072
4073
4074
4075 void DPSOFTRAST_VertexShader_ShowDepth(void)
4076 {
4077         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4078 }
4079
4080 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4081 {
4082         // TODO: IMPLEMENT
4083         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4084         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4085         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4086         memset(buffer_FragColorbgra8, 0, span->length*4);
4087         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4088 }
4089
4090
4091
4092 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4093 {
4094         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4095 }
4096
4097 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4098 {
4099         // TODO: IMPLEMENT
4100         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4101         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4102         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4103         memset(buffer_FragColorbgra8, 0, span->length*4);
4104         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4105 }
4106
4107
4108
4109 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4110 {
4111         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4112 }
4113
4114 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4115 {
4116         // TODO: IMPLEMENT
4117         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4118         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4119         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4120         memset(buffer_FragColorbgra8, 0, span->length*4);
4121         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4122 }
4123
4124
4125
4126 typedef struct DPSOFTRAST_ShaderModeInfo_s
4127 {
4128         int lodarrayindex;
4129         void (*Vertex)(void);
4130         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4131         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4132         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4133 }
4134 DPSOFTRAST_ShaderModeInfo;
4135
4136 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4137 {
4138         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4139         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4140         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4141         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4142         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4143         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4144         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
4145         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4146         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4147         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4148         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4149         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4150         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4151         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4152         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4153         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}}
4154 };
4155
4156 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4157 {
4158         int i;
4159         int x;
4160         int startx;
4161         int endx;
4162 //      unsigned int c;
4163 //      unsigned int *colorpixel;
4164         unsigned int *depthpixel;
4165         float w;
4166         float wslope;
4167         int depth;
4168         int depthslope;
4169         unsigned int d;
4170         DPSOFTRAST_State_Triangle *triangle;
4171         DPSOFTRAST_State_Span *span;
4172         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4173         for (i = 0; i < thread->numspans; i++)
4174         {
4175                 span = &thread->spans[i];
4176                 triangle = &thread->triangles[span->triangle];
4177                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4178                 {
4179                         wslope = triangle->w[0];
4180                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4181                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4182                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4183                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4184                         switch(thread->fb_depthfunc)
4185                         {
4186                         default:
4187                         case GL_ALWAYS:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = true; break;
4188                         case GL_LESS:    for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4189                         case GL_LEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4190                         case GL_EQUAL:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4191                         case GL_GEQUAL:  for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4192                         case GL_GREATER: for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4193                         case GL_NEVER:   for (x = 0, d = depth;x < span->length;x++, d += depthslope) pixelmask[x] = false; break;
4194                         }
4195                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4196                         //for (x = 0;x < span->length;x++)
4197                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4198                         // if there is no color buffer, skip pixel shader
4199                         startx = 0;
4200                         endx = span->length;
4201                         while (startx < endx && !pixelmask[startx])
4202                                 startx++;
4203                         while (endx > startx && !pixelmask[endx-1])
4204                                 endx--;
4205                         if (startx >= endx)
4206                                 continue; // no pixels to fill
4207                         span->pixelmask = pixelmask;
4208                         span->startx = startx;
4209                         span->endx = endx;
4210                         // run pixel shader if appropriate
4211                         // do this before running depthmask code, to allow the pixelshader
4212                         // to clear pixelmask values for alpha testing
4213                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4214                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4215                         if (thread->depthmask)
4216                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4217                                         if (pixelmask[x])
4218                                                 depthpixel[x] = d;
4219                 }
4220                 else
4221                 {
4222                         // no depth testing means we're just dealing with color...
4223                         // if there is no color buffer, skip pixel shader
4224                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4225                         {
4226                                 memset(pixelmask, 1, span->length);
4227                                 span->pixelmask = pixelmask;
4228                                 span->startx = 0;
4229                                 span->endx = span->length;
4230                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4231                         }
4232                 }
4233         }
4234         thread->numspans = 0;
4235 }
4236
// Draw command record (DEFCOMMAND itself is defined elsewhere in this file;
// the leading number is presumably the opcode value - TODO confirm).
// Fields as used by the code in this file:
//   starty, endy  - framebuffer row range of the draw, written by
//                   DPSOFTRAST_DrawTriangles after clamping to [0, fb_height]
//   refcount      - initialized to dpsoftrast.numthreads and decremented once
//                   by each DPSOFTRAST_Interpret_Draw call
//   clipped       - copy of dpsoftrast.drawclipped; when zero the near-plane
//                   clip test is skipped per triangle
//   firstvertex, numvertices, numtriangles - copied from the draw call
//   arrays        - start of the per-vertex float[4] data (screen coordinates
//                   first, then the position array and the shader's arrays)
//   element3i, element3s - private copy of the index list, or NULL
//   datasize      - not written by any code visible in this part of the file
4237 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4238
// Worker-side handler for a Draw command (the body is only compiled when
// SSE2_PRESENT is defined; otherwise the function does nothing).
//   thread  - state of the executing thread; thread->index selects the band
//             of framebuffer rows [miny, maxy) this call rasterizes into
//   command - the queued draw; its vertex data and optional index copy are
//             read, and its refcount is decremented exactly once per call
// For every triangle: optionally clip against the near plane, cull by facing,
// clamp the bounds to this thread's band, compute the interpolation plane
// equations for w and for the shader's arrays, choose mip levels, then emit
// spans that DPSOFTRAST_Draw_ProcessSpans consumes.
// The call that drops refcount to zero frees command->arrays when commandsize
// shows the vertex data was allocated separately rather than stored inline
// (see DPSOFTRAST_Draw_AllocateDrawCommand).
4239 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4240 {
4241 #ifdef SSE2_PRESENT
4242         int cullface = thread->cullface;
4243         int width = dpsoftrast.fb_width;
4244         int miny = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4245         int maxy = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4246         __m128i fbmin, fbmax;
4247         __m128 viewportcenter, viewportscale;
4248         int firstvertex = command->firstvertex;
4249         int numvertices = command->numvertices;
4250         int numtriangles = command->numtriangles;
4251         const int *element3i = command->element3i;
4252         const unsigned short *element3s = command->element3s;
4253         int clipped = command->clipped;
4254         int i;
4255         int j;
4256         int k;
4257         int y;
4258         int e[3];
4259         __m128i screeny;
4260         int starty, endy;
4261         int numpoints;
4262         int clipcase;
4263         float clipdist[4];
4264         __m128 triangleedge1, triangleedge2, trianglenormal;
4265         __m128 clipfrac[3];
4266         __m128 screen[4];
4267         DPSOFTRAST_State_Triangle *triangle;
4268         DPSOFTRAST_Texture *texture;
             // the draw's row range misses this thread's band: nothing to
             // rasterize, but this thread's reference must still be dropped
4269         if (command->starty >= maxy || command->endy <= miny)
4270         {
4271                 if (!ATOMIC_DECREMENT(command->refcount))
4272                 {
4273                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4274                                 MM_FREE(command->arrays);
4275                 }
4276                 return;
4277         }
4278         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // clamp limits for packed 16-bit (x, y) pairs:
             // x in [0, width-1], y in [miny, maxy-1]
4279         fbmin = _mm_setr_epi16(0, miny, 0, miny, 0, miny, 0, miny);
4280         fbmax = _mm_sub_epi16(_mm_setr_epi16(width, maxy, width, maxy, width, maxy, width, maxy), _mm_set1_epi16(1));
4281         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4282         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4283         screen[3] = _mm_setzero_ps();
4284         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4285         for (i = 0;i < numtriangles;i++)
4286         {
                     // command->arrays begins with the screen coordinates; the next
                     // numvertices float[4] entries are the positions whose z and w
                     // feed the near-plane test below.  'arrays' is advanced by one
                     // array per shader attribute later in this iteration.
4287                 const float *screencoord4f = command->arrays;
4288                 const float *arrays = screencoord4f + numvertices*4;
4289
4290                 // generate the 3 edges of this triangle
4291                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4292                 if (element3i)
4293                 {
4294                         e[0] = element3i[i*3+0] - firstvertex;
4295                         e[1] = element3i[i*3+1] - firstvertex;
4296                         e[2] = element3i[i*3+2] - firstvertex;
4297                 }
4298                 else if (element3s)
4299                 {
4300                         e[0] = element3s[i*3+0] - firstvertex;
4301                         e[1] = element3s[i*3+1] - firstvertex;
4302                         e[2] = element3s[i*3+2] - firstvertex;
4303                 }
4304                 else
4305                 {
4306                         e[0] = i*3+0;
4307                         e[1] = i*3+1;
4308                         e[2] = i*3+2;
4309                 }
4310
                     // SKIPBACKFACE: builds the two edges around screen[1] and the z
                     // term of their cross product (only the low lane of
                     // trianglenormal is computed), then 'continue's the triangle
                     // loop when the sign matches the culled facing.  The edges and
                     // normal are reused further down for the plane equations.
4311 #define SKIPBACKFACE \
4312                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4313                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4314                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4315                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4316                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4317                 switch(cullface) \
4318                 { \
4319                 case GL_BACK: \
4320                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4321                                 continue; \
4322                         break; \
4323                 case GL_FRONT: \
4324                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4325                                 continue; \
4326                         break; \
4327                 }
4328
                     // CLIPPEDVERTEXLERP: interpolates the positions of vertices p1
                     // and p2 at the fraction where clipdist crosses zero, projects
                     // the result into screen[k], and keeps the fraction in
                     // clipfrac[p1] so the attribute macros can reuse it.
                     // CLIPPEDVERTEXCOPY: takes the already projected coordinate.
4329 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4330                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4331                         { \
4332                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4333                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4334                         }
4335 #define CLIPPEDVERTEXCOPY(k,p1) \
4336                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4337
                     // GENATTRIBS: fetches the attribute values for the first three
                     // polygon points, repeating per clipcase the same copy/lerp
                     // pattern that produced screen[0..2] in the clip cases below.
4338 #define GENATTRIBCOPY(attrib, p1) \
4339                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4340 #define GENATTRIBLERP(attrib, p1, p2) \
4341                 { \
4342                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4343                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4344                 }
4345 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4346                 switch(clipcase) \
4347                 { \
4348                 default: \
4349                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4350                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4351                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4352                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4353                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4354                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4355                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4356                 }
4357
                     // draws not flagged as clipped skip the near-plane test and
                     // jump straight into the "entirely in front" case below
4358                 if (! clipped)
4359                         goto notclipped;
4360
4361                 // calculate distance from nearplane
4362                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4363                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4364                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4365                 if (clipdist[0] >= 0.0f)
4366                 {
4367                         if (clipdist[1] >= 0.0f)
4368                         {
4369                                 if (clipdist[2] >= 0.0f)
4370                                 {
4371                                 notclipped:
4372                                         // triangle is entirely in front of nearplane
4373                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4374                                         SKIPBACKFACE;
4375                                         numpoints = 3;
4376                                         clipcase = 0;
4377                                 }
4378                                 else
4379                                 {
4380                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4381                                         SKIPBACKFACE;
4382                                         numpoints = 4;
4383                                         clipcase = 1;
4384                                 }
4385                         }
4386                         else
4387                         {
4388                                 if (clipdist[2] >= 0.0f)
4389                                 {
4390                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4391                                         SKIPBACKFACE;
4392                                         numpoints = 4;
4393                                         clipcase = 2;
4394                                 }
4395                                 else
4396                                 {
4397                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4398                                         SKIPBACKFACE;
4399                                         numpoints = 3;
4400                                         clipcase = 3;
4401                                 }
4402                         }
4403                 }
4404                 else if (clipdist[1] >= 0.0f)
4405                 {
4406                         if (clipdist[2] >= 0.0f)
4407                         {
4408                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4409                                 SKIPBACKFACE;
4410                                 numpoints = 4;
4411                                 clipcase = 4;
4412                         }
4413                         else
4414                         {
4415                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4416                                 SKIPBACKFACE;
4417                                 numpoints = 3;
4418                                 clipcase = 5;
4419                         }
4420                 }
4421                 else if (clipdist[2] >= 0.0f)
4422                 {
4423                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4424                         SKIPBACKFACE;
4425                         numpoints = 3;
4426                         clipcase = 6;
4427                 }
4428                 else continue; // triangle is entirely behind nearplane
4429
4430                 {
4431                         // calculate integer y coords for triangle points
4432                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4433                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4434                                         screenmin = _mm_min_epi16(screeni, screenir),
4435                                         screenmax = _mm_max_epi16(screeni, screenir);
4436                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4437                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4438                         screenmin = _mm_max_epi16(screenmin, fbmin);
4439                         screenmax = _mm_min_epi16(screenmax, fbmax);
4440                         // skip offscreen triangles
4441                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4442                                 continue;
4443                         starty = _mm_extract_epi16(screenmin, 1);
4444                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // each 32-bit lane of screeni is a packed (x, y) pair of
                             // 16-bit values, so the arithmetic shift leaves the
                             // integer y of one polygon point per lane
4445                         screeny = _mm_srai_epi32(screeni, 16);
4446                 }
4447
4448                 triangle = &thread->triangles[thread->numtriangles];
4449
4450                 // calculate attribute plans for triangle data...
4451                 // okay, this triangle is going to produce spans, we'd better project
4452                 // the interpolants now (this is what gives perspective texturing),
4453                 // this consists of simply multiplying all arrays by the W coord
4454                 // (which is basically 1/Z), which will be undone per-pixel
4455                 // (multiplying by Z again) to get the perspective-correct array
4456                 // values
4457                 {
4458                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4459                         __m128 mipedgescale, mipdensity;
4460                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4461                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4462                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4463                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4464                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4465                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4466                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4467                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
4468                         attribedge1 = _mm_sub_ss(w0, w1);
4469                         attribedge2 = _mm_sub_ss(w2, w1);
4470                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4471                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4472                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4473                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4474                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
                             // triangle->w holds the plane for w as
                             // [x slope, y slope, value at screen origin]
4475                         _mm_store_ss(&triangle->w[0], attribxslope);
4476                         _mm_store_ss(&triangle->w[1], attribyslope);
4477                         _mm_store_ss(&triangle->w[2], attriborigin);
4478                         mipedgescale = _mm_setzero_ps();
                             // the shader's array list ends at the first entry that is
                             // >= DPSOFTRAST_ARRAY_TOTAL; 'arrays' steps to the next
                             // per-vertex array on every iteration
4479                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4480                         {
4481                                 __m128 attrib0, attrib1, attrib2;
4482                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4483                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4484                                         break;
4485                                 arrays += numvertices*4;
4486                                 GENATTRIBS(attrib0, attrib1, attrib2);
4487                                 attriborigin = _mm_mul_ps(attrib1, w1);
4488                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4489                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4490                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4491                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4492                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4493                                 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4494                                 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4495                                 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
4496                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4497                                 {
4498                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4499                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4500                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4501                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4502                                 }
4503                         }
4504
4505                         memset(triangle->mip, 0, sizeof(triangle->mip));
4506                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4507                         {
4508                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4509                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4510                                         break;
4511                                 texture = thread->texbound[texunit];
4512                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4513                                 {
4514                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4515                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4516                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4517                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4518                                         // this will be multiplied in the texturing routine by the texture resolution
                                             // y is only a scratch variable here; the scanline
                                             // loop below reinitializes it from starty
4519                                         y = _mm_cvtss_si32(mipdensity);
4520                                         if (y > 0)
4521                                         {
4522                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4523                                                 if (y > texture->mipmaps - 1)
4524                                                         y = texture->mipmaps - 1;
4525                                                 triangle->mip[texunit] = y;
4526                                         }
4527                                 }
4528                         }
4529                 }
4530
4531                 for (y = starty; y < endy;)
4532                 {
4533                         __m128 xcoords, xslope;
                             // a lane of ycc is all-ones when that point's integer y is
                             // below the current scanline y; movemask_epi8 turns every
                             // 32-bit lane into one hex digit of yccmask, point 0 in the
                             // lowest digit.  The switches below map each pattern to the
                             // two active edges, each given as a pair of point indices.
4534                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4535                         int yccmask = _mm_movemask_epi8(ycc);
4536                         int edge0p, edge0n, edge1p, edge1n;
4537                         int nexty;
4538                         if (numpoints == 4)
4539                         {
4540                                 switch(yccmask)
4541                                 {
4542                                 default:
4543                                 case 0xFFFF: /*0000*/ y = endy; continue;
4544                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4545                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4546                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4547                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4548                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4549                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4550                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4551                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4552                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4553                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4554                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4555                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4556                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4557                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4558                                 case 0x0000: /*1111*/ y++; continue;
4559                                 }
4560                         }
4561                         else
4562                         {
4563                                 switch(yccmask)
4564                                 {
4565                                 default:
4566                                 case 0xFFFF: /*000*/ y = endy; continue;
4567                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4568                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4569                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4570                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4571                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4572                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4573                                 case 0x0000: /*111*/ y++; continue;
4574                                 }
4575                         }
                             // passed points become 0x7FFF (all-ones shifted right by
                             // one) so the minimum across lanes is the smallest integer
                             // y among the points not yet passed; the current edge pair
                             // stays valid up to and including that row
4576                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4577                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4578                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4579                         nexty = _mm_extract_epi16(ycc, 0);
4580                         if(nexty >= endy) nexty = endy-1;
                             // make edge 0 the left one: swap when the larger x of edge 0
                             // exceeds the smaller x of edge 1
4581                         if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
4582                         {
4583                                 int tmp = edge0n;
4584                                 edge0n = edge1n;
4585                                 edge1n = tmp;
4586                                 tmp = edge0p;
4587                                 edge0p = edge1p;
4588                                 edge1p = tmp;
4589                         }
                             // lanes 0 and 2 of xslope become dx/dy of edge 0 and edge 1;
                             // lanes 0 and 2 of xcoords are the edges' x at row y
4590                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4591                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4592                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4593                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4594                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4595                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4596                         {
4597                                 int startx, endx, offset;
4598                                 startx = _mm_cvtss_si32(xcoords);
4599                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
4600                                 if (startx < 0) startx = 0;
4601                                 if (endx > dpsoftrast.fb_width) endx = dpsoftrast.fb_width;
4602                                 if (startx >= endx) continue;
                                     // cut the row [startx, endx) into spans of at most
                                     // DPSOFTRAST_DRAW_MAXSPANLENGTH pixels and flush the
                                     // span buffer whenever it fills up
4603                                 for (offset = startx; offset < endx;)
4604                                 {
4605                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4606                                         span->triangle = thread->numtriangles;
4607                                         span->x = offset;
4608                                         span->y = y;
4609                                         span->length = endx - offset;
4610                                         if (span -> length > DPSOFTRAST_DRAW_MAXSPANLENGTH)
4611                                                 span -> length = DPSOFTRAST_DRAW_MAXSPANLENGTH;
4612                                         offset += span->length;
4613                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4614                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
4615                                 }
4616                         }
4617                 }
4618
                     // the triangle slot is now in use; when the triangle buffer is
                     // full, process the pending spans so the slots can be reused
4619                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4620                 {
4621                         DPSOFTRAST_Draw_ProcessSpans(thread);
4622                         thread->numtriangles = 0;
4623                 }
4624         }
4625
             // this thread no longer reads the command's vertex data; the last
             // thread to get here frees it if it was allocated separately
4626         if (!ATOMIC_DECREMENT(command->refcount))
4627         {
4628                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4629                         MM_FREE(command->arrays);
4630         }
4631
4632         if (thread->numspans > 0 || thread->numtriangles > 0)
4633         {
4634                 DPSOFTRAST_Draw_ProcessSpans(thread);
4635                 thread->numtriangles = 0;
4636         }
4637 #endif
4638 }
4639
4640 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4641 {
4642         int i;
4643         int j;
4644         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
4645         int datasize = 2*numvertices*sizeof(float[4]);
4646         DPSOFTRAST_Command_Draw *command;
4647         unsigned char *data;
4648         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4649         {
4650                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4651                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4652                         break;
4653                 datasize += numvertices*sizeof(float[4]);
4654         }
4655         if (element3i)
4656                 datasize += numtriangles*sizeof(int[3]);
4657         else if (element3s)
4658                 datasize += numtriangles*sizeof(unsigned short[3]);
4659         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
4660         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4661         {
4662                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4663                 data = (unsigned char *)MM_CALLOC(datasize, 1);
4664         }
4665         else
4666         {
4667                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4668                 data = (unsigned char *)command + commandsize;
4669         }
4670         command->firstvertex = firstvertex;
4671         command->numvertices = numvertices;
4672         command->numtriangles = numtriangles;
4673         command->arrays = (float *)data;
4674         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4675         dpsoftrast.firstvertex = firstvertex;
4676         dpsoftrast.numvertices = numvertices;
4677         dpsoftrast.screencoord4f = (float *)data;
4678         data += numvertices*sizeof(float[4]);
4679         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4680         data += numvertices*sizeof(float[4]);
4681         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4682         {
4683                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4684                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4685                         break;
4686                 dpsoftrast.post_array4f[j] = (float *)data;
4687                 data += numvertices*sizeof(float[4]);
4688         }
4689         command->element3i = NULL;
4690         command->element3s = NULL;
4691         if (element3i)
4692         {
4693                 command->element3i = (int *)data;
4694                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
4695         }
4696         else if (element3s)
4697         {
4698                 command->element3s = (unsigned short *)data;
4699                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4700         }
4701         return command;
4702 }
4703
4704 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4705 {
4706         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4707         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4708         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4709         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
4710         if (command->starty >= command->endy)
4711         {
4712                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4713                         MM_FREE(command->arrays);
4714                 DPSOFTRAST_UndoCommand(command->commandsize);
4715                 return;
4716         }
4717         command->clipped = dpsoftrast.drawclipped;
4718         command->refcount = dpsoftrast.numthreads;
4719
4720 #ifdef USE_THREADS
4721         DPSOFTRAST_Draw_SyncCommands();
4722         {
4723                 int i;
4724                 int nexty = 0;
4725                 for (i = 0; i < dpsoftrast.numthreads; i++)
4726                 {
4727                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4728                         int y = nexty;
4729                         nexty = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
4730                         if (command->starty < nexty && command->endy > y && thread->starving)
4731                                 SDL_CondSignal(thread->drawcond);
4732                 }
4733         }
4734 #else
4735         DPSOFTRAST_Draw_FlushThreads();
4736 #endif
4737 }
4738  
4739 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4740 {
4741         int commandoffset = thread->commandoffset;
4742         while (commandoffset != endoffset)
4743         {
4744                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4745                 switch (command->opcode)
4746                 {
4747 #define INTERPCOMMAND(name) \
4748                 case DPSOFTRAST_OPCODE_##name : \
4749                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4750                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4751                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4752                                 commandoffset = 0; \
4753                         break;
4754                 INTERPCOMMAND(Viewport)
4755                 INTERPCOMMAND(ClearColor)
4756                 INTERPCOMMAND(ClearDepth)
4757                 INTERPCOMMAND(ColorMask)
4758                 INTERPCOMMAND(DepthTest)
4759                 INTERPCOMMAND(ScissorTest)
4760                 INTERPCOMMAND(Scissor)
4761                 INTERPCOMMAND(BlendFunc)
4762                 INTERPCOMMAND(BlendSubtract)
4763                 INTERPCOMMAND(DepthMask)
4764                 INTERPCOMMAND(DepthFunc)
4765                 INTERPCOMMAND(DepthRange)
4766                 INTERPCOMMAND(PolygonOffset)
4767                 INTERPCOMMAND(CullFace)
4768                 INTERPCOMMAND(AlphaTest)
4769                 INTERPCOMMAND(AlphaFunc)
4770                 INTERPCOMMAND(SetTexture)
4771                 INTERPCOMMAND(SetShader)
4772                 INTERPCOMMAND(Uniform4f)
4773                 INTERPCOMMAND(UniformMatrix4f)
4774                 INTERPCOMMAND(Uniform1i)
4775
4776                 case DPSOFTRAST_OPCODE_Draw:
4777                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4778                         commandoffset += command->commandsize;
4779                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4780                                 commandoffset = 0;
4781                         thread->commandoffset = commandoffset;
4782                         break;
4783
4784                 case DPSOFTRAST_OPCODE_Reset:
4785                         commandoffset = 0;
4786                         break;
4787                 }
4788         }
4789         thread->commandoffset = commandoffset;
4790 }
4791
#ifdef USE_THREADS
// Worker thread entry point; data is the thread's DPSOFTRAST_State_Thread.
//
// Runs until thread->index becomes negative.  While the thread's command
// offset differs from dpsoftrast.drawcommand it interprets commands up to
// that point.  Once caught up it takes drawmutex, signals waitcond if
// thread->waiting is set, marks itself as starving and sleeps on drawcond.
// Always returns 0.
static int DPSOFTRAST_Draw_Thread(void *data)
{
	DPSOFTRAST_State_Thread *worker = (DPSOFTRAST_State_Thread *)data;

	while (worker->index >= 0)
	{
		if (worker->commandoffset != dpsoftrast.drawcommand)
		{
			// there is work queued: run it and look again
			DPSOFTRAST_Draw_InterpretCommands(worker, dpsoftrast.drawcommand);
			continue;
		}

		// caught up: re-check under the mutex before going to sleep
		SDL_LockMutex(worker->drawmutex);
		if (worker->commandoffset == dpsoftrast.drawcommand && worker->index >= 0)
		{
			if (worker->waiting)
				SDL_CondSignal(worker->waitcond);
			worker->starving = true;
			SDL_CondWait(worker->drawcond, worker->drawmutex);
			worker->starving = false;
		}
		SDL_UnlockMutex(worker->drawmutex);
	}

	return 0;
}
#endif
4818
4819 static void DPSOFTRAST_Draw_FlushThreads(void)
4820 {
4821         DPSOFTRAST_State_Thread *thread;
4822         int i;
4823         DPSOFTRAST_Draw_SyncCommands();
4824 #ifdef USE_THREADS
4825         for (i = 0; i < dpsoftrast.numthreads; i++)
4826         {
4827                 thread = &dpsoftrast.threads[i];
4828                 if (thread->commandoffset != dpsoftrast.drawcommand)
4829                 {
4830                         SDL_LockMutex(thread->drawmutex);
4831                         if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4832                                 SDL_CondSignal(thread->drawcond);
4833                         SDL_UnlockMutex(thread->drawmutex);
4834                 }
4835         }
4836 #endif                  
4837         for (i = 0; i < dpsoftrast.numthreads; i++)
4838         {
4839                 thread = &dpsoftrast.threads[i];
4840 #ifdef USE_THREADS
4841                 if (thread->commandoffset != dpsoftrast.drawcommand)
4842                 {
4843                         SDL_LockMutex(thread->drawmutex);
4844                         if (thread->commandoffset != dpsoftrast.drawcommand)
4845                         {
4846                                 thread->waiting = true;
4847                                 SDL_CondWait(thread->waitcond, thread->drawmutex);
4848                                 thread->waiting = false;
4849                         }
4850                         SDL_UnlockMutex(thread->drawmutex);
4851                 }
4852 #else
4853                 if (thread->commandoffset != dpsoftrast.drawcommand)
4854                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4855 #endif
4856         }
4857         dpsoftrast.commandpool.usedcommands = 0;
4858 }
4859
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads().
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
4864
// Public finish entry point.  Performs the same work as DPSOFTRAST_Flush(),
// which is a plain forwarder to DPSOFTRAST_Draw_FlushThreads().
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
4869
4870 void DPSOFTRAST_Init(int width, int height, int numthreads, unsigned int *colorpixels, unsigned int *depthpixels)
4871 {
4872         int i;
4873         union
4874         {
4875                 int i;
4876                 unsigned char b[4];
4877         }
4878         u;
4879         u.i = 1;
4880         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4881         dpsoftrast.bigendian = u.b[3];
4882         dpsoftrast.fb_width = width;
4883         dpsoftrast.fb_height = height;
4884         dpsoftrast.fb_depthpixels = depthpixels;
4885         dpsoftrast.fb_colorpixels[0] = colorpixels;
4886         dpsoftrast.fb_colorpixels[1] = NULL;
4887         dpsoftrast.fb_colorpixels[1] = NULL;
4888         dpsoftrast.fb_colorpixels[1] = NULL;
4889         dpsoftrast.viewport[0] = 0;
4890         dpsoftrast.viewport[1] = 0;
4891         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4892         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4893         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
4894         dpsoftrast.texture_firstfree = 1;
4895         dpsoftrast.texture_end = 1;
4896         dpsoftrast.texture_max = 0;
4897         dpsoftrast.color[0] = 1;
4898         dpsoftrast.color[1] = 1;
4899         dpsoftrast.color[2] = 1;
4900         dpsoftrast.color[3] = 1;
4901 #ifdef USE_THREADS
4902         dpsoftrast.numthreads = bound(1, numthreads, 64);
4903 #else
4904         dpsoftrast.numthreads = 1;
4905 #endif
4906         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4907         for (i = 0; i < dpsoftrast.numthreads; i++)
4908         {
4909                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4910                 thread->index = i;
4911                 thread->cullface = GL_BACK;
4912                 thread->colormask[1] = 1;
4913                 thread->colormask[2] = 1;
4914                 thread->colormask[3] = 1;
4915                 thread->blendfunc[0] = GL_ONE;
4916                 thread->blendfunc[1] = GL_ZERO;
4917                 thread->depthmask = true;
4918                 thread->depthtest = true;
4919                 thread->depthfunc = GL_LEQUAL;
4920                 thread->scissortest = false;
4921                 thread->alphatest = false;
4922                 thread->alphafunc = GL_GREATER;
4923                 thread->alphavalue = 0.5f;
4924                 thread->viewport[0] = 0;
4925                 thread->viewport[1] = 0;
4926                 thread->viewport[2] = dpsoftrast.fb_width;
4927                 thread->viewport[3] = dpsoftrast.fb_height;
4928                 thread->scissor[0] = 0;
4929                 thread->scissor[1] = 0;
4930                 thread->scissor[2] = dpsoftrast.fb_width;
4931                 thread->scissor[3] = dpsoftrast.fb_height;
4932                 thread->depthrange[0] = 0;
4933                 thread->depthrange[1] = 1;
4934                 thread->polygonoffset[0] = 0;
4935                 thread->polygonoffset[1] = 0;
4936
4937                 thread->numspans = 0;
4938                 thread->numtriangles = 0;
4939                 thread->commandoffset = 0;
4940                 thread->waiting = false;
4941                 thread->starving = false;
4942 #ifdef USE_THREADS
4943                 thread->waitcond = SDL_CreateCond();
4944                 thread->drawcond = SDL_CreateCond();
4945                 thread->drawmutex = SDL_CreateMutex();
4946 #endif
4947
4948                 thread->validate = -1;
4949                 DPSOFTRAST_Validate(thread, -1);
4950 #ifdef USE_THREADS
4951                 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
4952 #endif
4953         }
4954 }
4955
4956 void DPSOFTRAST_Shutdown(void)
4957 {
4958         int i;
4959 #ifdef USE_THREADS
4960         if(dpsoftrast.numthreads > 0)
4961         {
4962                 DPSOFTRAST_State_Thread *thread;
4963                 for (i = 0; i < dpsoftrast.numthreads; i++)
4964                 {
4965                         thread = &dpsoftrast.threads[i];
4966                         SDL_LockMutex(thread->drawmutex);
4967                         thread->index = -1;
4968                         SDL_CondSignal(thread->drawcond);
4969                         SDL_UnlockMutex(thread->drawmutex);
4970                         SDL_WaitThread(thread->thread, NULL);
4971                         SDL_DestroyCond(thread->waitcond);
4972                         SDL_DestroyCond(thread->drawcond);
4973                         SDL_DestroyMutex(thread->drawmutex);
4974                 }
4975         }
4976 #endif
4977         for (i = 0;i < dpsoftrast.texture_end;i++)
4978                 if (dpsoftrast.texture[i].bytes)
4979                         MM_FREE(dpsoftrast.texture[i].bytes);
4980         if (dpsoftrast.texture)
4981                 free(dpsoftrast.texture);
4982         if (dpsoftrast.threads)
4983                 MM_FREE(dpsoftrast.threads);
4984         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4985 }
4986