de.git.xonotic.org Git - xonotic/darkplaces.git / blob - dpsoftrast.c
Commit: implemented scissoring of triangles and the extra blend mode necessary for the Xonotic HUD
[xonotic/darkplaces.git] / dpsoftrast.c
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#define _USE_MATH_DEFINES
#include <math.h>
#include "quakedef.h"
#include "dpsoftrast.h"
7
8 #ifdef USE_SDL
9 #define USE_THREADS
10 #endif
11
12 #ifndef __cplusplus
13 typedef qboolean bool;
14 #endif
15
16 #define ALIGN_SIZE 16
17 #define ATOMIC_SIZE 32
18
19 #ifdef SSE2_PRESENT
20         #if defined(__GNUC__)
21                 #define ALIGN(var) var __attribute__((__aligned__(16)))
22                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
23                 #ifdef USE_THREADS
24                         #define MEMORY_BARRIER (_mm_sfence())
25                         //(__sync_synchronize())
26                         #define ATOMIC_COUNTER volatile int
27                         #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
28                         #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
29                         #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
30                 #endif
31         #elif defined(_MSC_VER)
32                 #define ALIGN(var) __declspec(align(16)) var
33                 #define ATOMIC(var) __declspec(align(32)) var
34                 #ifdef USE_THREADS
35                         #define MEMORY_BARRIER (_mm_sfence())
36                         //(MemoryBarrier())
37                         #define ATOMIC_COUNTER volatile LONG
38                         #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
39                         #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
40                         #define ATOMIC_ADD(counter, val) (InterlockedExchangeAdd(&(counter), (val)))
41                 #endif
42         #else
43                 #undef USE_THREADS
44                 #undef SSE2_PRESENT
45         #endif
46 #endif
47
48 #ifndef SSE2_PRESENT
49         #define ALIGN(var) var
50         #define ATOMIC(var) var
51 #endif
52
53 #ifdef USE_THREADS
54 #include <SDL.h>
55 #include <SDL_thread.h>
56 #else
57         #define MEMORY_BARRIER ((void)0)
58         #define ATOMIC_COUNTER int
59         #define ATOMIC_INCREMENT(counter) (++(counter))
60         #define ATOMIC_DECREMENT(counter) (--(counter))
61         #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
62         typedef void SDL_Thread;
63         typedef void SDL_cond;
64         typedef void SDL_mutex;
65 #endif
66
// Allocation wrappers used for texture pixel storage.
// With SSE2_PRESENT the memory comes from _mm_malloc aligned to ATOMIC_SIZE
// and must be released with MM_FREE (_mm_free); otherwise the C library
// allocator is used directly.
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// calloc() equivalent on top of _mm_malloc: returns zero-filled memory for
// nmemb elements of size bytes, or NULL on allocation failure or when
// nmemb*size does not fit in size_t (same contract as calloc)
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// refuse instead of silently allocating a wrapped-around (too small) size
	if (size != 0 && nmemb > ((size_t)-1) / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
85
// Index of a vertex attribute array.
// DPSOFTRAST_ARRAY_TOTAL sizes the per-attribute tables in this file
// (DPSOFTRAST_State_Triangle::attribs and DPSOFTRAST_State::post_array4f).
typedef enum DPSOFTRAST_ARRAY_e
{
	DPSOFTRAST_ARRAY_POSITION,
	DPSOFTRAST_ARRAY_COLOR,
	DPSOFTRAST_ARRAY_TEXCOORD0,
	DPSOFTRAST_ARRAY_TEXCOORD1,
	DPSOFTRAST_ARRAY_TEXCOORD2,
	DPSOFTRAST_ARRAY_TEXCOORD3,
	DPSOFTRAST_ARRAY_TEXCOORD4,
	DPSOFTRAST_ARRAY_TEXCOORD5,
	DPSOFTRAST_ARRAY_TEXCOORD6,
	DPSOFTRAST_ARRAY_TEXCOORD7,
	DPSOFTRAST_ARRAY_TOTAL // number of arrays, not an array itself
}
DPSOFTRAST_ARRAY;
101
// One slot of the dpsoftrast.texture array.
// A slot whose bytes pointer is NULL is free (see DPSOFTRAST_Texture_GetByIndex
// and the slot search in DPSOFTRAST_Texture_New).
typedef struct DPSOFTRAST_Texture_s
{
	int flags; // DPSOFTRAST_TEXTURE_FLAG_ / FORMAT_ bits passed to DPSOFTRAST_Texture_New
	int width; // level 0 dimensions
	int height;
	int depth;
	int sides; // 6 for cubemaps, 1 otherwise
	DPSOFTRAST_TEXTURE_FILTER filter;
	int mipmaps; // number of valid rows in mipmap[]
	int size; // total size of bytes, all levels together
	ATOMIC_COUNTER binds; // when non-zero, DPSOFTRAST_Flush() is called before the texture is modified
	unsigned char *bytes; // pixel storage for every level, 4 bytes per pixel
	// per level: [0] byte offset into bytes, [1] byte size, [2] width, [3] height, [4] depth
	int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
}
DPSOFTRAST_Texture;
117
// commands use the same alignment as ALIGN() data
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// Header shared by every command; each DEFCOMMAND struct starts with the
// same two members.
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
	unsigned char opcode; // one of the DPSOFTRAST_OPCODE_ values
	unsigned short commandsize;
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> = opcodeval and the aligned struct
// DPSOFTRAST_Command_<name> made of the common header followed by fields.
#define DEFCOMMAND(opcodeval, name, fields) \
	enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
	typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
	{ \
		unsigned char opcode; \
		unsigned short commandsize; \
		fields \
	} DPSOFTRAST_Command_##name );
138
// size in bytes of the command storage below
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// NOTE(review): presumably the upper bound for a single command's size - confirm against the allocator
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// Byte storage for queued commands.
typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
{
	int freecommand; // copied to dpsoftrast.drawcommand by DPSOFTRAST_Draw_SyncCommands
	int usedcommands;
	ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
149
// Per-triangle interpolation data consumed by the DPSOFTRAST_CALCATTRIB macros.
typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
{
	unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
	float w[3];
	// per attribute array, three 4-float rows combined as
	// value = row[2] + x*row[0] + y*row[1] (see DPSOFTRAST_CALCATTRIB)
	ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
157
// DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex)
// Evaluates attribute array arrayindex of triangle at the start of span:
//   slope = attribs[arrayindex][0]
//   data  = attribs[arrayindex][2] + span->x * slope + span->y * attribs[arrayindex][1]
// triangle and span are pointers; data and slope are __m128 lvalues.
// The expansion is a brace block, so it is used as a statement.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar variant of DPSOFTRAST_CALCATTRIB: data and slope are float[4].
// span is parenthesized before ->, so any pointer expression (for example
// &spans[i]) can be passed, exactly as with the SSE variant above.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	slope[0] = (triangle)->attribs[arrayindex][0][0]; \
	slope[1] = (triangle)->attribs[arrayindex][0][1]; \
	slope[2] = (triangle)->attribs[arrayindex][0][2]; \
	slope[3] = (triangle)->attribs[arrayindex][0][3]; \
	data[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*slope[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	data[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*slope[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	data[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*slope[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	data[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*slope[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
174                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels produced for a triangle.
// x and y are the coordinates fed into the DPSOFTRAST_CALCATTRIB macros.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
	int triangle; // triangle this span was generated by
	int x; // framebuffer x coord
	int y; // framebuffer y coord
	int startx; // usable range (according to pixelmask)
	int endx; // usable range (according to pixelmask)
	unsigned char *pixelmask; // true for pixels that passed depth test, false for others
}
DPSOFTRAST_State_Span);
187
// capacities of the spans[] and triangles[] arrays in DPSOFTRAST_State_Thread
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128

// bits of DPSOFTRAST_State_Thread::validate; a set bit means the matching
// derived values are stale and are recomputed by DPSOFTRAST_Validate
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)

// Blend modes stored in fb_blendmode; DPSOFTRAST_RecalcBlendFunc maps the
// GL source/destination factor pair (and the subtract flag) to one of these.
typedef enum DPSOFTRAST_BLENDMODE_e
{
	DPSOFTRAST_BLENDMODE_OPAQUE, // GL_ONE, GL_ZERO - also the fallback for unlisted pairs
	DPSOFTRAST_BLENDMODE_ALPHA, // GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA
	DPSOFTRAST_BLENDMODE_ADDALPHA, // GL_SRC_ALPHA, GL_ONE
	DPSOFTRAST_BLENDMODE_ADD, // GL_ONE, GL_ONE
	DPSOFTRAST_BLENDMODE_INVMOD, // GL_ZERO, GL_ONE_MINUS_SRC_COLOR
	DPSOFTRAST_BLENDMODE_MUL, // GL_ZERO, GL_SRC_COLOR or GL_DST_COLOR, GL_ZERO
	DPSOFTRAST_BLENDMODE_MUL2, // GL_DST_COLOR, GL_SRC_COLOR
	DPSOFTRAST_BLENDMODE_SUBALPHA, // GL_SRC_ALPHA, GL_ONE with blendsubtract set
	DPSOFTRAST_BLENDMODE_PSEUDOALPHA, // GL_ONE, GL_ONE_MINUS_SRC_ALPHA
	DPSOFTRAST_BLENDMODE_INVADD, // GL_ONE_MINUS_DST_COLOR, GL_ONE
	DPSOFTRAST_BLENDMODE_TOTAL
}
DPSOFTRAST_BLENDMODE;
211
// State held per entry of dpsoftrast.threads: a copy of the render state,
// the values derived from it, and the span/triangle work buffers.
typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
{
	SDL_Thread *thread;
	int index;
	
	// render state as set by the API
	int cullface;
	int colormask[4];
	int blendfunc[2]; // [0] source factor, [1] destination factor
	int blendsubtract;
	int depthmask;
	int depthtest;
	int depthfunc;
	int scissortest;
	int alphatest;
	int alphafunc;
	float alphavalue;
	int viewport[4]; // x, y, width, height as consumed by DPSOFTRAST_RecalcViewport
	int scissor[4]; // x, y, width, height as consumed by DPSOFTRAST_RecalcFB
	float depthrange[2];
	float polygonoffset[2];

	int shader_mode;
	int shader_permutation;

	// pointers into dpsoftrast.texture (rebased by DPSOFTRAST_Texture_Grow)
	DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
	
	ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
	int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

	// DPSOFTRAST_VALIDATE_ flags
	int validate;

	// derived values (DPSOFTRAST_VALIDATE_FB)
	int fb_colormask;
	int fb_scissor[4]; // x, y, width, height in framebuffer rows (y flipped), clamped to the framebuffer
	ALIGN(float fb_viewportcenter[4]);
	ALIGN(float fb_viewportscale[4]);

	// derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
	int fb_depthfunc; // depthfunc, or GL_ALWAYS while depthtest is off

	// derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
	int fb_blendmode; // DPSOFTRAST_BLENDMODE_ value

	// band boundaries
	int miny1;
	int maxy1;
	int miny2;
	int maxy2;

	ATOMIC(volatile int commandoffset);

	volatile bool waiting;
	volatile bool starving;
	SDL_cond *waitcond;
	SDL_cond *drawcond;
	SDL_mutex *drawmutex;

	// work buffers and their fill counts
	int numspans;
	int numtriangles;
	DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
	DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
}
DPSOFTRAST_State_Thread);
276
// Global rasterizer state; the single instance is dpsoftrast below.
typedef ATOMIC(struct DPSOFTRAST_State_s
{
	// render targets as passed to DPSOFTRAST_SetRenderTargets
	int fb_width;
	int fb_height;
	unsigned int *fb_depthpixels;
	unsigned int *fb_colorpixels[4];

	int viewport[4];
	ALIGN(float fb_viewportcenter[4]);
	ALIGN(float fb_viewportscale[4]);

	float color[4];
	ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
	int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

	// vertex array pointers and layout
	const float *pointer_vertex3f;
	const float *pointer_color4f;
	const unsigned char *pointer_color4ub;
	const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
	int stride_vertex;
	int stride_color;
	int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
	int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
	// pointers into texture[] (rebased by DPSOFTRAST_Texture_Grow)
	DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];

	int firstvertex;
	int numvertices;
	float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
	float *screencoord4f;
	int drawstarty;
	int drawendy;
	int drawclipped;
	
	int shader_mode;
	int shader_permutation;

	// texture slot array
	int texture_max; // allocated number of slots
	int texture_end; // one past the highest slot that has pixel storage
	int texture_firstfree; // where DPSOFTRAST_Texture_New starts looking for a free slot
	DPSOFTRAST_Texture *texture;

	int bigendian;

	// error reporting
	const char *errorstring; // set to a static message by calls that fail

	int interlace;
	int numthreads; // number of entries in threads
	DPSOFTRAST_State_Thread *threads;

	// set from commandpool.freecommand by DPSOFTRAST_Draw_SyncCommands
	ATOMIC(volatile int drawcommand);

	DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

DPSOFTRAST_State dpsoftrast;
334
// depth buffer scale: 2^30
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// packs four float channels into one 32bit pixel laid out as
// (a << 24) | (r << 16) | (g << 8) | b, each channel scaled by 255 and rounded;
// every argument is parenthesized so expressions can be passed safely
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// converts a float depth d to the integer depth buffer value DEPTHSCALE * (1 - d)
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
340
341 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
342 {
343         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
344         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
345         fb_viewportcenter[3] = 0.5f;
346         fb_viewportcenter[0] = 0.0f;
347         fb_viewportscale[1] = 0.5f * viewport[2];
348         fb_viewportscale[2] = -0.5f * viewport[3];
349         fb_viewportscale[3] = 0.5f;
350         fb_viewportscale[0] = 1.0f;
351 }
352
353 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
354 {
355         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
356         // and viewport projection values
357         int x1, x2;
358         int y1, y2;
359         x1 = thread->scissor[0];
360         x2 = thread->scissor[0] + thread->scissor[2];
361         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
362         y2 = dpsoftrast.fb_height - thread->scissor[1];
363         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
364         if (x1 < 0) x1 = 0;
365         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
366         if (y1 < 0) y1 = 0;
367         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
368         thread->fb_scissor[0] = x1;
369         thread->fb_scissor[1] = y1;
370         thread->fb_scissor[2] = x2 - x1;
371         thread->fb_scissor[3] = y2 - y1;
372
373         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
374 }
375
376 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
377 {
378         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
379 }
380
381 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
382 {
383         if (thread->blendsubtract)
384         {
385                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
386                 {
387                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
388                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
389                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
390                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
391                 }
392         }
393         else
394         {       
395                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
396                 {
397                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
398                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
399                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
400                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
401                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
402                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
403                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
404                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
405                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
406                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
407                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
408                 }
409         }
410 }
411
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags in
// f is pending in thread->validate; the expression always evaluates to 0.
// thread is parenthesized so any pointer expression can be passed; note that
// both arguments are evaluated twice when validation runs.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
413
414 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
415 {
416         mask &= thread->validate;
417         if (!mask)
418                 return;
419         if (mask & DPSOFTRAST_VALIDATE_FB)
420         {
421                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
422                 DPSOFTRAST_RecalcFB(thread);
423         }
424         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
425         {
426                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
427                 DPSOFTRAST_RecalcDepthFunc(thread);
428         }
429         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
430         {
431                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
432                 DPSOFTRAST_RecalcBlendFunc(thread);
433         }
434 }
435
436 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
437 {
438         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
439                 return &dpsoftrast.texture[index];
440         return NULL;
441 }
442
443 static void DPSOFTRAST_Texture_Grow(void)
444 {
445         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
446         DPSOFTRAST_State_Thread *thread;
447         int i;
448         int j;
449         DPSOFTRAST_Flush();
450         // expand texture array as needed
451         if (dpsoftrast.texture_max < 1024)
452                 dpsoftrast.texture_max = 1024;
453         else
454                 dpsoftrast.texture_max *= 2;
455         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
456         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
457                 if (dpsoftrast.texbound[i])
458                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
459         for (j = 0; j < dpsoftrast.numthreads; j++)
460         {
461                 thread = &dpsoftrast.threads[j];
462                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
463                         if (thread->texbound[i])
464                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
465         }
466 }
467
468 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
469 {
470         int w;
471         int h;
472         int d;
473         int size;
474         int s;
475         int texnum;
476         int mipmaps;
477         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
478         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
479         DPSOFTRAST_Texture *texture;
480         if (width*height*depth < 1)
481         {
482                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
483                 return 0;
484         }
485         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
486         {
487                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
488                 return 0;
489         }
490         switch(texformat)
491         {
492         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
493         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
494         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
495                 break;
496         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
497                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
498                 {
499                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
500                         return 0;
501                 }
502                 if (depth != 1)
503                 {
504                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
505                         return 0;
506                 }
507                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
508                 {
509                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
510                         return 0;
511                 }
512                 break;
513         }
514         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
515         {
516                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
517                 return 0;
518         }
519         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
520         {
521                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
522                 return 0;
523         }
524         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
525         {
526                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
527                 return 0;
528         }
529         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
530         {
531                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
532                 return 0;
533         }
534         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
535         {
536                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
537                 return 0;
538         }
539         // find first empty slot in texture array
540         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
541                 if (!dpsoftrast.texture[texnum].bytes)
542                         break;
543         dpsoftrast.texture_firstfree = texnum + 1;
544         if (dpsoftrast.texture_max <= texnum)
545                 DPSOFTRAST_Texture_Grow();
546         if (dpsoftrast.texture_end <= texnum)
547                 dpsoftrast.texture_end = texnum + 1;
548         texture = &dpsoftrast.texture[texnum];
549         memset(texture, 0, sizeof(*texture));
550         texture->flags = flags;
551         texture->width = width;
552         texture->height = height;
553         texture->depth = depth;
554         texture->sides = sides;
555         texture->binds = 0;
556         w = width;
557         h = height;
558         d = depth;
559         size = 0;
560         mipmaps = 0;
561         w = width;
562         h = height;
563         d = depth;
564         for (;;)
565         {
566                 s = w * h * d * sides * 4;
567                 texture->mipmap[mipmaps][0] = size;
568                 texture->mipmap[mipmaps][1] = s;
569                 texture->mipmap[mipmaps][2] = w;
570                 texture->mipmap[mipmaps][3] = h;
571                 texture->mipmap[mipmaps][4] = d;
572                 size += s;
573                 mipmaps++;
574                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
575                         break;
576                 if (w > 1) w >>= 1;
577                 if (h > 1) h >>= 1;
578                 if (d > 1) d >>= 1;
579         }
580         texture->mipmaps = mipmaps;
581         texture->size = size;
582
583         // allocate the pixels now
584         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
585
586         return texnum;
587 }
588 void DPSOFTRAST_Texture_Free(int index)
589 {
590         DPSOFTRAST_Texture *texture;
591         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
592         if (texture->binds)
593                 DPSOFTRAST_Flush();
594         if (texture->bytes)
595                 MM_FREE(texture->bytes);
596         texture->bytes = NULL;
597         memset(texture, 0, sizeof(*texture));
598         // adjust the free range and used range
599         if (dpsoftrast.texture_firstfree > index)
600                 dpsoftrast.texture_firstfree = index;
601         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
602                 dpsoftrast.texture_end--;
603 }
// Regenerates every mip level above 0 of a texture by box filtering the
// level below it. Does nothing for invalid indices or textures with a single
// level. Pixels are 4 bytes; each channel is averaged with rounding.
void DPSOFTRAST_Texture_CalculateMipmaps(int index)
{
	int i, x, y, z, w, layer0, layer1, row0, row1;
	unsigned char *o, *i0, *i1, *i2, *i3;
	DPSOFTRAST_Texture *texture;
	texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
	if (texture->mipmaps <= 1)
		return;
	// level i is built from level i-1
	for (i = 1;i < texture->mipmaps;i++)
	{
		for (z = 0;z < texture->mipmap[i][4];z++)
		{
			// the two source layers feeding destination layer z; the second one
			// is clamped when the parent level has no further layer
			layer0 = z*2;
			layer1 = z*2+1;
			if (layer1 >= texture->mipmap[i-1][4])
				layer1 = texture->mipmap[i-1][4]-1;
			for (y = 0;y < texture->mipmap[i][3];y++)
			{
				// the two source rows feeding destination row y, clamped the same way
				row0 = y*2;
				row1 = y*2+1;
				if (row1 >= texture->mipmap[i-1][3])
					row1 = texture->mipmap[i-1][3]-1;
				// o: start of the destination row
				// i0..i3: start of the source rows (layer0,row0), (layer0,row1), (layer1,row0), (layer1,row1)
				o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
				i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
				i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
				i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
				i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
				w = texture->mipmap[i][2];
				if (layer1 > layer0)
				{
					// two distinct source layers
					if (texture->mipmap[i-1][2] > 1)
					{
						// average 3D texture
						// each output pixel is the rounded mean of 2 pixels from each of the 4 source rows
						for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
						{
							o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
							o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
							o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
							o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
						}
					}
					else
					{
						// average 3D mipmap with parent width == 1
						// one pixel from each of the 4 source rows
						for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
						{
							o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
							o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
							o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
							o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
						}
					}
				}
				else
				{
					// single source layer: only i0 and i1 are used
					if (texture->mipmap[i-1][2] > 1)
					{
						// average 2D texture (common case)
						for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
						{
							o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
							o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
							o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
							o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
						}
					}
					else
					{
						// 2D texture with parent width == 1
						// a single output pixel per row, averaged from the two source rows
						o[0] = (i0[0] + i1[0] + 1) >> 1;
						o[1] = (i0[1] + i1[1] + 1) >> 1;
						o[2] = (i0[2] + i1[2] + 1) >> 1;
						o[3] = (i0[3] + i1[3] + 1) >> 1;
					}
				}
			}
		}
	}
}
683 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
684 {
685         DPSOFTRAST_Texture *texture;
686         unsigned char *dst;
687         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
688         if (texture->binds)
689                 DPSOFTRAST_Flush();
690         dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
691         while (blockheight > 0)
692         {
693                 memcpy(dst, pixels, blockwidth * 4);
694                 pixels += blockwidth * 4;
695                 dst += texture->mipmap[0][2] * 4;
696                 blockheight--;
697         }
698         DPSOFTRAST_Texture_CalculateMipmaps(index);
699 }
700 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
701 {
702         DPSOFTRAST_Texture *texture;
703         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
704         if (texture->binds)
705                 DPSOFTRAST_Flush();
706         memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
707         DPSOFTRAST_Texture_CalculateMipmaps(index);
708 }
709 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
710 {
711         DPSOFTRAST_Texture *texture;
712         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
713         return texture->mipmap[mip][2];
714 }
715 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
716 {
717         DPSOFTRAST_Texture *texture;
718         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
719         return texture->mipmap[mip][3];
720 }
721 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
722 {
723         DPSOFTRAST_Texture *texture;
724         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
725         return texture->mipmap[mip][4];
726 }
727 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
728 {
729         DPSOFTRAST_Texture *texture;
730         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
731         if (texture->binds)
732                 DPSOFTRAST_Flush();
733         return texture->bytes + texture->mipmap[mip][0];
734 }
735 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
736 {
737         DPSOFTRAST_Texture *texture;
738         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
739         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
740         {
741                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
742                 return;
743         }
744         if (texture->binds)
745                 DPSOFTRAST_Flush();
746         texture->filter = filter;
747 }
748
749 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
750 {
751         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
752                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
753                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
754                 DPSOFTRAST_Flush();
755         dpsoftrast.fb_width = width;
756         dpsoftrast.fb_height = height;
757         dpsoftrast.fb_depthpixels = depthpixels;
758         dpsoftrast.fb_colorpixels[0] = colorpixels0;
759         dpsoftrast.fb_colorpixels[1] = colorpixels1;
760         dpsoftrast.fb_colorpixels[2] = colorpixels2;
761         dpsoftrast.fb_colorpixels[3] = colorpixels3;
762 }
763
764 static void DPSOFTRAST_Draw_FlushThreads(void);
765
// Publishes everything allocated in the command pool so far by advancing
// dpsoftrast.drawcommand to the current allocation position.
766 static void DPSOFTRAST_Draw_SyncCommands(void)
767 {
        // barrier first, so the command contents are written out before the
        // new drawcommand value becomes visible
768         MEMORY_BARRIER;
769         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
770 }
771
// Makes sure at least `space` units of the command pool are free.
// Threaded build: returns immediately if the pool already has room;
// otherwise publishes the queued commands and waits on the thread that is
// furthest behind until enough of the pool has been consumed.
// Non-threaded build: simply calls DPSOFTRAST_Draw_FlushThreads().
772 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
773 {
774 #ifdef USE_THREADS
775         DPSOFTRAST_State_Thread *thread;
776         int i;
777         int freecommand = dpsoftrast.commandpool.freecommand;
778         int usedcommands = dpsoftrast.commandpool.usedcommands;
        // fast path: the bookkeeping already shows enough room
779         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
780                 return;
        // let the threads see everything queued so far
781         DPSOFTRAST_Draw_SyncCommands();
782         for(;;)
783         {
784                 int waitindex = -1;
785                 int commandoffset;
                // recompute the real usage: the largest distance (modulo the
                // pool size) between the allocation position and any thread's
                // read position, remembering which thread that is
786                 usedcommands = 0;
787                 for (i = 0; i < dpsoftrast.numthreads; i++)
788                 {
789                         thread = &dpsoftrast.threads[i]; 
790                         commandoffset = freecommand - thread->commandoffset;
791                         if (commandoffset < 0)
792                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
793                         if (commandoffset > usedcommands)
794                         {
795                                 waitindex = i;
796                                 usedcommands = commandoffset;
797                         }
798                 }
                // done when there is room, or when no thread is behind at all
799                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
800                         break;
                // wait on the slowest thread, then re-evaluate
801                 thread = &dpsoftrast.threads[waitindex];
802                 SDL_LockMutex(thread->drawmutex);
                // only sleep if that thread has not yet caught up with the published commands
803                 if (thread->commandoffset != dpsoftrast.drawcommand)
804                 {
805                         thread->waiting = true;
                        // wake the thread if it flagged itself as starving
806                         if (thread->starving) SDL_CondSignal(thread->drawcond);
807                         SDL_CondWait(thread->waitcond, thread->drawmutex);
808                         thread->waiting = false;
809                 }
810                 SDL_UnlockMutex(thread->drawmutex);
811         }
        // store the freshly measured usage
812         dpsoftrast.commandpool.usedcommands = usedcommands;
813 #else
814         DPSOFTRAST_Draw_FlushThreads();
815 #endif
816 }
817
// DPSOFTRAST_ALIGNCOMMAND pads `size` up to the next multiple of COMMAND_SIZE
// (the mask arithmetic assumes COMMAND_SIZE is a power of two - TODO confirm).
818 #define DPSOFTRAST_ALIGNCOMMAND(size) \
819         ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// DPSOFTRAST_ALLOCATECOMMAND(name) allocates a padded DPSOFTRAST_Command_<name>
// with opcode DPSOFTRAST_OPCODE_<name> and returns it with the right type.
820 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
821         ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
822
// Reserves `size` units in the circular command pool, fills in the command
// header (opcode and commandsize) and returns a pointer to the command.
// May block in DPSOFTRAST_Draw_FreeCommandPool when the pool is too full.
// A command is never split across the end of the pool: if the tail is too
// small, a Reset opcode is written there and allocation restarts at 0.
823 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
824 {
825         DPSOFTRAST_Command *command;
826         int freecommand = dpsoftrast.commandpool.freecommand;
827         int usedcommands = dpsoftrast.commandpool.usedcommands;
        // always keep one command header spare
        // (presumably room for a later Reset marker - TODO confirm)
828         int extra = sizeof(DPSOFTRAST_Command);
        // if the command does not fit in the tail, the tail is wasted too
829         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
830                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
831         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
832         {
833                 DPSOFTRAST_Draw_FreeCommandPool(size + extra);
                // reload the bookkeeping after freeing
834                 freecommand = dpsoftrast.commandpool.freecommand;
835                 usedcommands = dpsoftrast.commandpool.usedcommands;
836         }
837         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
838         {
                // mark the unusable tail with Reset, count it as used, wrap to 0
839                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
840                 command->opcode = DPSOFTRAST_OPCODE_Reset;
841                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
842                 freecommand = 0;
843         }
844         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
845         command->opcode = opcode;
846         command->commandsize = size;
847         freecommand += size;
        // landing exactly on the end wraps without a Reset marker
848         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
849                 freecommand = 0;
850         dpsoftrast.commandpool.freecommand = freecommand;
851         dpsoftrast.commandpool.usedcommands = usedcommands + size;
852         return command;
853 }
854
855 static void DPSOFTRAST_UndoCommand(int size)
856 {
857         int freecommand = dpsoftrast.commandpool.freecommand;
858         int usedcommands = dpsoftrast.commandpool.usedcommands;
859         freecommand -= size;
860         if (freecommand < 0)
861                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
862         usedcommands -= size;
863         dpsoftrast.commandpool.freecommand = freecommand;
864         dpsoftrast.commandpool.usedcommands = usedcommands;
865 }
866                 
867 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
868 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
869 {
870         thread->viewport[0] = command->x;
871         thread->viewport[1] = command->y;
872         thread->viewport[2] = command->width;
873         thread->viewport[3] = command->height;
874         thread->validate |= DPSOFTRAST_VALIDATE_FB;
875 }
876 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
877 {
878         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
879         command->x = x;
880         command->y = y;
881         command->width = width;
882         command->height = height;
883
884         dpsoftrast.viewport[0] = x;
885         dpsoftrast.viewport[1] = y;
886         dpsoftrast.viewport[2] = width;
887         dpsoftrast.viewport[3] = height;
888         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
889 }
890
// ClearColor command: fills the thread's share of the scissor rectangle in
// every non-NULL color target with the packed BGRA8 value of (r, g, b, a).
// The rows handled are limited to [miny1, maxy1) and [miny2, maxy2), which
// appear to be the two row bands assigned to this thread.
891 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
892 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
893 {
894         int i, x1, y1, x2, y2, w, h, x, y;
895         int miny1 = thread->miny1;
896         int maxy1 = thread->maxy1;
897         int miny2 = thread->miny2;
898         int maxy2 = thread->maxy2;
899         int bandy;
900         unsigned int *p;
901         unsigned int c;
        // bring thread->fb_scissor up to date before reading it
902         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
903         x1 = thread->fb_scissor[0];
904         y1 = thread->fb_scissor[1];
905         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
906         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
        // clip the rows to the span covered by this thread's bands
907         if (y1 < miny1) y1 = miny1;
908         if (y2 > maxy2) y2 = maxy2;
909         w = x2 - x1;
910         h = y2 - y1;
911         if (w < 1 || h < 1)
912                 return;
913         // FIXME: honor fb_colormask?
914         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
915         for (i = 0;i < 4;i++)
916         {
917                 if (!dpsoftrast.fb_colorpixels[i])
918                         continue;
                // outer loop walks the bands: first up to maxy1, then it jumps to
                // miny2 and continues up to maxy2; inner loop fills one band
919                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
920                 for (;y < bandy;y++)
921                 {
922                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
923                         for (x = x1;x < x2;x++)
924                                 p[x] = c;
925                 }
926         }
927 }
928 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
929 {
930         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
931         command->r = r;
932         command->g = g;
933         command->b = b;
934         command->a = a;
935 }
936
937 DEFCOMMAND(3, ClearDepth, float depth;)
938 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
939 {
940         int x1, y1, x2, y2, w, h, x, y;
941         int miny1 = thread->miny1;
942         int maxy1 = thread->maxy1;
943         int miny2 = thread->miny2;
944         int maxy2 = thread->maxy2;
945         int bandy;
946         unsigned int *p;
947         unsigned int c;
948         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
949         x1 = thread->fb_scissor[0];
950         y1 = thread->fb_scissor[1];
951         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
952         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
953         if (y1 < miny1) y1 = miny1;
954         if (y2 > maxy2) y2 = maxy2;
955         w = x2 - x1;
956         h = y2 - y1;
957         if (w < 1 || h < 1)
958                 return;
959         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
960         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
961         for (;y < bandy;y++)
962         {
963                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
964                 for (x = x1;x < x2;x++)
965                         p[x] = c;
966         }
967 }
968 void DPSOFTRAST_ClearDepth(float d)
969 {
970         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
971         command->depth = d;
972 }
973
974 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
975 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
976 {
977         thread->colormask[0] = command->r != 0;
978         thread->colormask[1] = command->g != 0;
979         thread->colormask[2] = command->b != 0;
980         thread->colormask[3] = command->a != 0;
981         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
982 }
983 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
984 {
985         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
986         command->r = r;
987         command->g = g;
988         command->b = b;
989         command->a = a;
990 }
991
992 DEFCOMMAND(5, DepthTest, int enable;)
993 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
994 {
995         thread->depthtest = command->enable;
996         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
997 }
998 void DPSOFTRAST_DepthTest(int enable)
999 {
1000         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1001         command->enable = enable;
1002 }
1003
1004 DEFCOMMAND(6, ScissorTest, int enable;)
1005 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1006 {
1007         thread->scissortest = command->enable;
1008         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1009 }
1010 void DPSOFTRAST_ScissorTest(int enable)
1011 {
1012         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1013         command->enable = enable;
1014 }
1015
1016 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1017 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1018 {
1019         thread->scissor[0] = command->x;
1020         thread->scissor[1] = command->y;
1021         thread->scissor[2] = command->width;
1022         thread->scissor[3] = command->height;
1023         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1024 }
1025 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1026 {
1027         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1028         command->x = x;
1029         command->y = y;
1030         command->width = width;
1031         command->height = height;
1032 }
1033
1034 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1035 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1036 {
1037         thread->blendfunc[0] = command->sfactor;
1038         thread->blendfunc[1] = command->dfactor;
1039         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1040 }
1041 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1042 {
1043         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1044         command->sfactor = sfactor;
1045         command->dfactor = dfactor;
1046 }
1047
1048 DEFCOMMAND(9, BlendSubtract, int enable;)
1049 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1050 {
1051         thread->blendsubtract = command->enable;
1052         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1053 }
1054 void DPSOFTRAST_BlendSubtract(int enable)
1055 {
1056         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1057         command->enable = enable;
1058 }
1059
1060 DEFCOMMAND(10, DepthMask, int enable;)
1061 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1062 {
1063         thread->depthmask = command->enable;
1064 }
1065 void DPSOFTRAST_DepthMask(int enable)
1066 {
1067         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1068         command->enable = enable;
1069 }
1070
1071 DEFCOMMAND(11, DepthFunc, int func;)
1072 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1073 {
1074         thread->depthfunc = command->func;
1075 }
1076 void DPSOFTRAST_DepthFunc(int func)
1077 {
1078         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1079         command->func = func;
1080 }
1081
1082 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1083 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1084 {
1085         thread->depthrange[0] = command->nearval;
1086         thread->depthrange[1] = command->farval;
1087 }
1088 void DPSOFTRAST_DepthRange(float nearval, float farval)
1089 {
1090         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1091         command->nearval = nearval;
1092         command->farval = farval;
1093 }
1094
1095 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1096 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1097 {
1098         thread->polygonoffset[0] = command->alongnormal;
1099         thread->polygonoffset[1] = command->intoview;
1100 }
1101 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1102 {
1103         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1104         command->alongnormal = alongnormal;
1105         command->intoview = intoview;
1106 }
1107
1108 DEFCOMMAND(14, CullFace, int mode;)
1109 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1110 {
1111         thread->cullface = command->mode;
1112 }
1113 void DPSOFTRAST_CullFace(int mode)
1114 {
1115         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1116         command->mode = mode;
1117 }
1118
1119 DEFCOMMAND(15, AlphaTest, int enable;)
1120 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1121 {
1122         thread->alphatest = command->enable;
1123 }
1124 void DPSOFTRAST_AlphaTest(int enable)
1125 {
1126         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1127         command->enable = enable;
1128 }
1129
1130 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1131 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1132 {
1133         thread->alphafunc = command->func;
1134         thread->alphavalue = command->ref;
1135 }
1136 void DPSOFTRAST_AlphaFunc(int func, float ref)
1137 {
1138         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1139         command->func = func;
1140         command->ref = ref;
1141 }
1142
1143 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1144 {
1145         dpsoftrast.color[0] = r;
1146         dpsoftrast.color[1] = g;
1147         dpsoftrast.color[2] = b;
1148         dpsoftrast.color[3] = a;
1149 }
1150
1151 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1152 {
1153         int outstride = blockwidth * 4;
1154         int instride = dpsoftrast.fb_width * 4;
1155         int bx1 = blockx;
1156         int by1 = blocky;
1157         int bx2 = blockx + blockwidth;
1158         int by2 = blocky + blockheight;
1159         int bw;
1160         int bh;
1161         int x;
1162         int y;
1163         unsigned char *inpixels;
1164         unsigned char *b;
1165         unsigned char *o;
1166         DPSOFTRAST_Flush();
1167         if (bx1 < 0) bx1 = 0;
1168         if (by1 < 0) by1 = 0;
1169         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1170         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1171         bw = bx2 - bx1;
1172         bh = by2 - by1;
1173         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1174         if (dpsoftrast.bigendian)
1175         {
1176                 for (y = by1;y < by2;y++)
1177                 {
1178                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1179                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1180                         for (x = bx1;x < bx2;x++)
1181                         {
1182                                 o[0] = b[3];
1183                                 o[1] = b[2];
1184                                 o[2] = b[1];
1185                                 o[3] = b[0];
1186                                 o += 4;
1187                                 b += 4;
1188                         }
1189                 }
1190         }
1191         else
1192         {
1193                 for (y = by1;y < by2;y++)
1194                 {
1195                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1196                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1197                         memcpy(o, b, bw*4);
1198                 }
1199         }
1200
1201 }
1202 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1203 {
1204         int tx1 = tx;
1205         int ty1 = ty;
1206         int tx2 = tx + width;
1207         int ty2 = ty + height;
1208         int sx1 = sx;
1209         int sy1 = sy;
1210         int sx2 = sx + width;
1211         int sy2 = sy + height;
1212         int swidth;
1213         int sheight;
1214         int twidth;
1215         int theight;
1216         int sw;
1217         int sh;
1218         int tw;
1219         int th;
1220         int y;
1221         unsigned int *spixels;
1222         unsigned int *tpixels;
1223         DPSOFTRAST_Texture *texture;
1224         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1225         if (mip < 0 || mip >= texture->mipmaps) return;
1226         if (texture->binds)
1227                 DPSOFTRAST_Flush();
1228         spixels = dpsoftrast.fb_colorpixels[0];
1229         swidth = dpsoftrast.fb_width;
1230         sheight = dpsoftrast.fb_height;
1231         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1232         twidth = texture->mipmap[mip][2];
1233         theight = texture->mipmap[mip][3];
1234         if (tx1 < 0) tx1 = 0;
1235         if (ty1 < 0) ty1 = 0;
1236         if (tx2 > twidth) tx2 = twidth;
1237         if (ty2 > theight) ty2 = theight;
1238         if (sx1 < 0) sx1 = 0;
1239         if (sy1 < 0) sy1 = 0;
1240         if (sx2 > swidth) sx2 = swidth;
1241         if (sy2 > sheight) sy2 = sheight;
1242         tw = tx2 - tx1;
1243         th = ty2 - ty1;
1244         sw = sx2 - sx1;
1245         sh = sy2 - sy1;
1246         if (tw > sw) tw = sw;
1247         if (th > sh) th = sh;
1248         if (tw < 1 || th < 1)
1249                 return;
1250         for (y = 0;y < th;y++)
1251                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 + y) * swidth + sx1), tw*4);
1252         if (texture->mipmaps > 1)
1253                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1254 }
1255
1256 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1257 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1258 {
1259         if (thread->texbound[command->unitnum])
1260                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1261         thread->texbound[command->unitnum] = command->texture;
1262 }
1263 void DPSOFTRAST_SetTexture(int unitnum, int index)
1264 {
1265         DPSOFTRAST_Command_SetTexture *command;
1266         DPSOFTRAST_Texture *texture;
1267         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1268         {
1269                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1270                 return;
1271         }
1272         texture = DPSOFTRAST_Texture_GetByIndex(index);
1273         if (index && !texture)
1274         {
1275                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1276                 return;
1277         }
1278
1279         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1280         command->unitnum = unitnum;
1281         command->texture = texture;
1282
1283         dpsoftrast.texbound[unitnum] = texture;
1284         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1285 }
1286
1287 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1288 {
1289         dpsoftrast.pointer_vertex3f = vertex3f;
1290         dpsoftrast.stride_vertex = stride;
1291 }
1292 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1293 {
1294         dpsoftrast.pointer_color4f = color4f;
1295         dpsoftrast.pointer_color4ub = NULL;
1296         dpsoftrast.stride_color = stride;
1297 }
1298 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1299 {
1300         dpsoftrast.pointer_color4f = NULL;
1301         dpsoftrast.pointer_color4ub = color4ub;
1302         dpsoftrast.stride_color = stride;
1303 }
1304 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1305 {
1306         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1307         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1308         dpsoftrast.stride_texcoord[unitnum] = stride;
1309 }
1310
1311 DEFCOMMAND(18, SetShader, int mode; int permutation;)
1312 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1313 {
1314         thread->shader_mode = command->mode;
1315         thread->shader_permutation = command->permutation;
1316 }
1317 void DPSOFTRAST_SetShader(int mode, int permutation)
1318 {
1319         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1320         command->mode = mode;
1321         command->permutation = permutation;
1322
1323         dpsoftrast.shader_mode = mode;
1324         dpsoftrast.shader_permutation = permutation;
1325 }
1326
1327 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1328 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1329 {
1330         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1331 }
1332 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1333 {
1334         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1335         command->index = index;
1336         command->val[0] = v0;
1337         command->val[1] = v1;
1338         command->val[2] = v2;
1339         command->val[3] = v3;
1340
1341         dpsoftrast.uniform4f[index*4+0] = v0;
1342         dpsoftrast.uniform4f[index*4+1] = v1;
1343         dpsoftrast.uniform4f[index*4+2] = v2;
1344         dpsoftrast.uniform4f[index*4+3] = v3;
1345 }
1346 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1347 {
1348         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1349         command->index = index;
1350         memcpy(command->val, v, sizeof(command->val));
1351
1352         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1353 }
1354
1355 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1356 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1357 {
1358         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1359 }
// Uploads `arraysize` 4x4 float matrices starting at uniform slot `uniform`.
// Each matrix takes four consecutive slots; `v` advances by 16 floats per
// matrix.  For every matrix a UniformMatrix4f command is queued and the
// values are mirrored in the global uniform4f array.  A non-zero
// `transpose` transposes each matrix before it is stored.
// Without SSE2_PRESENT this function does nothing.
1360 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1361 {
1362 #ifdef SSE2_PRESENT
1363         int i, index;
1364         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1365         {
1366                 __m128 m0, m1, m2, m3;
1367                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1368                 command->index = index;
                // use unaligned loads unless v sits on an ALIGN_SIZE boundary
1369                 if (((size_t)v)&(ALIGN_SIZE-1))
1370                 {
1371                         m0 = _mm_loadu_ps(v);
1372                         m1 = _mm_loadu_ps(v+4);
1373                         m2 = _mm_loadu_ps(v+8);
1374                         m3 = _mm_loadu_ps(v+12);
1375                 }
1376                 else
1377                 {
1378                         m0 = _mm_load_ps(v);
1379                         m1 = _mm_load_ps(v+4);
1380                         m2 = _mm_load_ps(v+8);
1381                         m3 = _mm_load_ps(v+12);
1382                 }
1383                 if (transpose)
1384                 {
                        // 4x4 transpose: interleave row pairs, then recombine halves
1385                         __m128 t0, t1, t2, t3;
1386                         t0 = _mm_unpacklo_ps(m0, m1);
1387                         t1 = _mm_unpacklo_ps(m2, m3);
1388                         t2 = _mm_unpackhi_ps(m0, m1);
1389                         t3 = _mm_unpackhi_ps(m2, m3);
1390                         m0 = _mm_movelh_ps(t0, t1);
1391                         m1 = _mm_movehl_ps(t1, t0);
1392                         m2 = _mm_movelh_ps(t2, t3);
1393                         m3 = _mm_movehl_ps(t3, t2);                     
1394                 }
                // aligned stores into the queued command ...
1395                 _mm_store_ps(command->val, m0);
1396                 _mm_store_ps(command->val+4, m1);
1397                 _mm_store_ps(command->val+8, m2);
1398                 _mm_store_ps(command->val+12, m3);
                // ... and into the global mirror
                // (assumes dpsoftrast.uniform4f is 16-byte aligned - TODO confirm)
1399                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1400                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1401                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1402                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1403         }
1404 #endif
1405 }
1406
1407 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1408 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1409 {
1410         thread->uniform1i[command->index] = command->val;
1411 }
1412 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1413 {
1414         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1415         command->index = index;
1416         command->val = i0;
1417
1418         dpsoftrast.uniform1i[command->index] = i0;
1419 }
1420
1421 #ifdef SSE2_PRESENT
1422 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1423 {
1424         float *end = dst + size*4;
1425         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1426         {
1427                 while (dst < end)
1428                 {
1429                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1430                         dst += 4;
1431                         src += stride;
1432                 }
1433         }
1434         else
1435         {
1436                 while (dst < end)
1437                 {
1438                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1439                         dst += 4;
1440                         src += stride;
1441                 }
1442         }
1443 }
1444
1445 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1446 {
1447         float *end = dst + size*4;
1448         if (stride == sizeof(float[3]))
1449         {
1450                 float *end4 = dst + (size&~3)*4;        
1451                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1452                 {
1453                         while (dst < end4)
1454                         {
1455                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1456                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1457                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1458                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1459                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1460                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1461                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1462                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1463                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1464                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1465                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1466                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1467                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1468                                 dst += 16;
1469                                 src += 4*sizeof(float[3]);
1470                         }
1471                 }
1472                 else
1473                 {
1474                         while (dst < end4)
1475                         {
1476                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1477                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1478                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1479                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1480                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1481                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1482                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1483                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1484                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1485                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1486                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1487                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1488                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1489                                 dst += 16;
1490                                 src += 4*sizeof(float[3]);
1491                         }
1492                 }
1493         }
1494         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1495         {
1496                 while (dst < end)
1497                 {
1498                         __m128 v = _mm_loadu_ps((const float *)src);
1499                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1500                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1501                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1502                         _mm_store_ps(dst, v);
1503                         dst += 4;
1504                         src += stride;
1505                 }
1506         }
1507         else
1508         {
1509                 while (dst < end)
1510                 {
1511                         __m128 v = _mm_load_ps((const float *)src);
1512                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1513                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1514                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1515                         _mm_store_ps(dst, v);
1516                         dst += 4;
1517                         src += stride;
1518                 }
1519         }
1520 }
1521
1522 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1523 {
1524         float *end = dst + size*4;
1525         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1526         if (stride == sizeof(float[2]))
1527         {
1528                 float *end2 = dst + (size&~1)*4;
1529                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1530                 {
1531                         while (dst < end2)
1532                         {
1533                                 __m128 v = _mm_loadu_ps((const float *)src);
1534                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1535                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1536                                 dst += 8;
1537                                 src += 2*sizeof(float[2]);
1538                         }
1539                 }
1540                 else
1541                 {
1542                         while (dst < end2)
1543                         {
1544                                 __m128 v = _mm_load_ps((const float *)src);
1545                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1546                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1547                                 dst += 8;
1548                                 src += 2*sizeof(float[2]);
1549                         }
1550                 }
1551         }
1552         while (dst < end)
1553         {
1554                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1555                 dst += 4;
1556                 src += stride;
1557         }
1558 }
1559
1560 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1561 {
1562         float *end = dst + size*4;
1563         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1564         if (stride == sizeof(unsigned char[4]))
1565         {
1566                 float *end4 = dst + (size&~3)*4;
1567                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1568                 {
1569                         while (dst < end4)
1570                         {
1571                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1572                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1573                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1574                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1575                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1576                                 dst += 16;
1577                                 src += 4*sizeof(unsigned char[4]);
1578                         }
1579                 }
1580                 else
1581                 {
1582                         while (dst < end4)
1583                         {
1584                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1585                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1586                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1587                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1588                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1589                                 dst += 16;
1590                                 src += 4*sizeof(unsigned char[4]);
1591                         }
1592                 }
1593         }
1594         while (dst < end)
1595         {
1596                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1597                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1598                 dst += 4;
1599                 src += stride;
1600         }
1601 }
1602
// Writes `size` copies of the 4-float vector at src into consecutive 4-float slots of dst.
// src is read once with an unaligned load; dst is written with aligned 16-byte stores.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 fill = _mm_loadu_ps(src);
	float *const stop = dst + 4*size;
	for (; dst < stop; dst += 4)
		_mm_store_ps(dst, fill);
}
1613 #endif
1614
// Transforms numitems 4-float vectors from in4f into out4f by the 16-float matrix inmatrix16f:
//   out = in[0]*M[0..3] + in[1]*M[4..7] + in[2]*M[8..11] + in[3]*M[12..15]
// A matrix that is bitwise equal to the identity turns the call into a plain copy
// (or into nothing at all when out4f == in4f).
// out4f is written with aligned 16-byte stores; in4f and inmatrix16f may be unaligned.
// The function body is empty when SSE2_PRESENT is not defined.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 row0, row1, row2, row3;
	float *stop;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// identity matrix: nothing to compute
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	stop = out4f + numitems*4;
	row0 = _mm_loadu_ps(inmatrix16f);
	row1 = _mm_loadu_ps(inmatrix16f + 4);
	row2 = _mm_loadu_ps(inmatrix16f + 8);
	row3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1))
	{
		// input is not 16-byte aligned
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_loadu_ps(in4f);
			const __m128 zw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2),
			                             _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3));
			const __m128 yzw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1), zw);
			_mm_store_ps(out4f, _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0), yzw));
		}
	}
	else
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			const __m128 v = _mm_load_ps(in4f);
			const __m128 zw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), row2),
			                             _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), row3));
			const __m128 yzw = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), row1), zw);
			_mm_store_ps(out4f, _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), row0), yzw));
		}
	}
#endif
}
1662
// Copies numitems 4-float vectors from in4f to out4f using memcpy, so the two ranges must not overlap.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t bytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, bytes);
}
1667
1668 #ifdef SSE2_PRESENT
// Applies the perspective divide and the viewport mapping to one position.
//   out - __m128 lvalue that receives the result; may be the same variable as in
//   in  - __m128 expression holding (x, y, z, w); evaluated once
//   viewportcenter, viewportscale - __m128 values whose lanes 1, 2, 3 are applied to x, y, z and
//         whose lane 0 is applied to a constant 1.0 that takes the place of w
// Result lanes, writing c for viewportcenter and s for viewportscale:
//   (c[1] + s[1]*x/w, c[2] + s[2]*y/w, c[3] + s[3]*z/w, c[0] + s[0]/w)
// The expansion is a brace block that declares locals named p and w, so the arguments must not
// refer to variables with those names.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
	p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}

// Performs exactly the same operation as DPSOFTRAST_PROJECTVERTEX: the two definitions were
// verbatim copies of each other, so this one forwards to keep a single implementation.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
	DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale)
1684
// Computes out = in[0]*m0 + (in[1]*m1 + (in[2]*m2 + in[3]*m3)) on __m128 values.
//   out      - __m128 lvalue that receives the result; may be the same variable as in
//   in       - __m128 expression, evaluated once
//   m0..m3   - __m128 values, each scaled by one lane of in
// The expansion is a brace block with locals named xfsrc_ and xfsum_.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	const __m128 xfsrc_ = (in); \
	__m128 xfsum_ = _mm_mul_ps(_mm_shuffle_ps(xfsrc_, xfsrc_, _MM_SHUFFLE(3, 3, 3, 3)), m3); \
	xfsum_ = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(xfsrc_, xfsrc_, _MM_SHUFFLE(2, 2, 2, 2)), m2), xfsum_); \
	xfsum_ = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(xfsrc_, xfsrc_, _MM_SHUFFLE(1, 1, 1, 1)), m1), xfsum_); \
	out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(xfsrc_, xfsrc_, _MM_SHUFFLE(0, 0, 0, 0)), m0), xfsum_); \
}
1693
// Bounds the projected y extent of the axis-aligned box [minpos, maxpos] once its eight corners
// have been transformed by m0..m3 (see DPSOFTRAST_TRANSFORMVERTEX).
//   starty, endy   - receive the truncated, viewport-mapped bounds (endy has 1 added)
//   minpos, maxpos - componentwise minimum and maximum of the untransformed positions
//   viewportcenter, viewportscale - in the lane order DPSOFTRAST_PROJECTVERTEX uses (lane 2 is y)
// Corner k takes its x from maxpos when bit 0 of k is set, its y when bit 1 is set and its
// z and w when bit 2 is set; everything else comes from minpos.
// Returns a mask in which bit k is set for every corner k whose transformed z + w is negative.
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
{
	__m128 bb[8], clipdist[8];
	__m128 minproj = _mm_set_ss(2.0f);
	__m128 maxproj = _mm_set_ss(-2.0f);
	int clipmask = 0xFF;
	// exchange lanes 0 and 1 of each matrix vector so the transformed y lands in lane 0,
	// which is the lane the scalar (_ss) operations below work on
	m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
	m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
	m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
	m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
	// BBFRONT: transform corner k and record its z + w; when that is not negative, clear the
	// corner's clipmask bit and merge its y/w into minproj/maxproj
	#define BBFRONT(k, pos) \
	{ \
		DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
		clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
		if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
		{ \
			const __m128 cornery = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
			clipmask &= ~(1<<k); \
			minproj = _mm_min_ss(minproj, cornery); \
			maxproj = _mm_max_ss(maxproj, cornery); \
		} \
	}
	BBFRONT(0, minpos);
	BBFRONT(1, _mm_move_ss(minpos, maxpos));
	BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(6, _mm_move_ss(maxpos, minpos));
	BBFRONT(7, maxpos);
	// BBCLIPEDGE: if neighbour j of the clipped corner k is not clipped, interpolate along the
	// edge k -> j to the point where z + w reaches zero and merge that point's y/w
	#define BBCLIPEDGE(k, j) \
	if (!(clipmask&(1<<(j)))) \
	{ \
		const __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[j])); \
		__m128 cut = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[j], bb[k]))); \
		cut = _mm_div_ss(cut, _mm_shuffle_ps(cut, cut, _MM_SHUFFLE(3, 3, 3, 3))); \
		minproj = _mm_min_ss(minproj, cut); \
		maxproj = _mm_max_ss(maxproj, cut); \
	}
	// BBCLIP: for a clipped corner k, visit its three box edges (the neighbours along x, y and z)
	#define BBCLIP(k) \
	{ \
		if (clipmask&(1<<k)) \
		{ \
			BBCLIPEDGE(k, k^1) \
			BBCLIPEDGE(k, k^2) \
			BBCLIPEDGE(k, k^4) \
		} \
	}
	BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
	// move the y entries of the viewport vectors (lane 2) into lane 0
	viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
	viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
	// keep the bounds within [-2, 2] before mapping them through the viewport
	minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
	maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
	minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
	maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
	// NOTE(review): starty comes from the largest projected y and endy from the smallest,
	// which presumably relies on a negative y viewport scale - confirm where fb_viewportscale is set
	*starty = _mm_cvttss_si32(maxproj);
	*endy = _mm_cvttss_si32(minproj)+1;
	return clipmask;
}
1764 #endif
1765         
// Copies numitems 4-float positions from in4f to out4f and writes their projected counterparts
// (DPSOFTRAST_PROJECTVERTEX with dpsoftrast.fb_viewportcenter / fb_viewportscale) to screen4f.
//   out4f, screen4f - written with aligned 16-byte stores
//   starty, endy    - optional; when both are non-NULL they receive the y bounds computed by
//                     DPSOFTRAST_Vertex_BoundY for the bounding box of the input positions
//   in4f            - source positions; may be unaligned
// Returns the DPSOFTRAST_Vertex_BoundY clip mask when bounds were requested, otherwise 0.
// Without SSE2_PRESENT nothing is written and 0 is returned.
static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
{
#ifdef SSE2_PRESENT
	float *end = out4f + numitems*4;
	__m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	__m128 minpos, maxpos;
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		// the bounding box starts out as the first position
		minpos = maxpos = _mm_loadu_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	else
	{
		minpos = maxpos = _mm_load_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	// no matrix was applied here, so the box is bounded with the identity
	if (starty && endy)
		return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale,
					_mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
					_mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
					_mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
					_mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
#endif
	// kept outside the #ifdef so the function returns a value in every configuration
	return 0;
}
1813
// Transforms numitems 4-float positions from in4f by inmatrix16f into out4f
// (DPSOFTRAST_TRANSFORMVERTEX) and writes their projected counterparts (DPSOFTRAST_PROJECTVERTEX
// with dpsoftrast.fb_viewportcenter / fb_viewportscale) to screen4f.
//   out4f, screen4f - written with aligned 16-byte stores
//   starty, endy    - optional; when both are non-NULL they receive the y bounds computed by
//                     DPSOFTRAST_Vertex_BoundY for the bounding box of the untransformed input
//   in4f            - source positions; may be unaligned
//   inmatrix16f     - 16 floats; a matrix bitwise equal to the identity is handed to
//                     DPSOFTRAST_Vertex_Project instead
// Returns the DPSOFTRAST_Vertex_BoundY clip mask when bounds were requested, otherwise 0.
// Without SSE2_PRESENT nothing is written and 0 is returned.
static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
	float *end;
	if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
		return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
	end = out4f + numitems*4;
	viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
	viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	m0 = _mm_loadu_ps(inmatrix16f);
	m1 = _mm_loadu_ps(inmatrix16f + 4);
	m2 = _mm_loadu_ps(inmatrix16f + 8);
	m3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
	{
		// the bounding box starts out as the first position
		minpos = maxpos = _mm_loadu_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_loadu_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	else
	{
		minpos = maxpos = _mm_load_ps(in4f);
		while (out4f < end)
		{
			__m128 v = _mm_load_ps(in4f);
			minpos = _mm_min_ps(minpos, v);
			maxpos = _mm_max_ps(maxpos, v);
			DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
			_mm_store_ps(out4f, v);
			DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
			_mm_store_ps(screen4f, v);
			in4f += 4;
			out4f += 4;
			screen4f += 4;
		}
	}
	if (starty && endy)
		return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3);
#endif
	// kept outside the #ifdef so the function returns a value in every configuration
	return 0;
}
1868
1869 static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
1870 {
1871         float *outf = dpsoftrast.post_array4f[outarray];
1872         const unsigned char *inb;
1873         int firstvertex = dpsoftrast.firstvertex;
1874         int numvertices = dpsoftrast.numvertices;
1875         int stride;
1876         switch(inarray)
1877         {
1878         case DPSOFTRAST_ARRAY_POSITION:
1879                 stride = dpsoftrast.stride_vertex;
1880                 inb = (unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
1881                 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1882                 break;
1883         case DPSOFTRAST_ARRAY_COLOR:
1884                 stride = dpsoftrast.stride_color;
1885                 if (dpsoftrast.pointer_color4f)
1886                 {
1887                         inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
1888                         DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1889                 }
1890                 else if (dpsoftrast.pointer_color4ub)
1891                 {
1892                         stride = dpsoftrast.stride_color;
1893                         inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
1894                         DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
1895                 }
1896                 else
1897                 {
1898                         DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
1899                 }
1900                 break;
1901         default:
1902                 stride = dpsoftrast.stride_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0];
1903                 if (dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1904                 {
1905                         inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[inarray-DPSOFTRAST_ARRAY_TEXCOORD0] + firstvertex * stride;
1906                         switch(dpsoftrast.components_texcoord[inarray-DPSOFTRAST_ARRAY_TEXCOORD0])
1907                         {
1908                         case 2:
1909                                 DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
1910                                 break;
1911                         case 3:
1912                                 DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
1913                                 break;
1914                         case 4:
1915                                 DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
1916                                 break;
1917                         }
1918                 }
1919                 break;
1920         }
1921         return outf;
1922 }
1923
1924 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1925 {
1926         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1927         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1928         return data;
1929 }
1930
#if 0
// Disabled: projects an attribute array without transforming it, storing the screen coordinates
// in dpsoftrast.screencoord4f and the clip mask and y bounds in dpsoftrast.drawclipped,
// dpsoftrast.drawstarty and dpsoftrast.drawendy.
// With inarray >= 0 the array is first loaded by DPSOFTRAST_Array_Load.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
}
#endif
1939
1940 static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
1941 {
1942         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1943         dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
1944         return data;
1945 }
1946
1947 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1948 {
1949         int x;
1950         int startx = span->startx;
1951         int endx = span->endx;
1952         float wslope = triangle->w[0];
1953         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1954         float endz = 1.0f / (w + wslope * startx);
1955         for (x = startx;x < endx;)
1956         {
1957                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1958                 float z = endz, dz;
1959                 if (nextsub >= endx) nextsub = endsub = endx-1;
1960                 endz = 1.0f / (w + wslope * nextsub);
1961                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1962                 for (; x <= endsub; x++, z += dz)
1963                         zf[x] = z;
1964         }
1965 }
1966
1967 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1968 {
1969         int x;
1970         int startx = span->startx;
1971         int endx = span->endx;
1972         int d[4];
1973         float a, b;
1974         unsigned char * RESTRICT pixelmask = span->pixelmask;
1975         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1976         if (!pixel)
1977                 return;
1978         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1979         // handle alphatest now (this affects depth writes too)
1980         if (thread->alphatest)
1981                 for (x = startx;x < endx;x++)
1982                         if (in4f[x*4+3] < 0.5f)
1983                                 pixelmask[x] = false;
1984         // FIXME: this does not handle bigendian
1985         switch(thread->fb_blendmode)
1986         {
1987         case DPSOFTRAST_BLENDMODE_OPAQUE:
1988                 for (x = startx;x < endx;x++)
1989                 {
1990                         if (!pixelmask[x])
1991                                 continue;
1992                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
1993                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
1994                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
1995                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
1996                         pixel[x*4+0] = d[0];
1997                         pixel[x*4+1] = d[1];
1998                         pixel[x*4+2] = d[2];
1999                         pixel[x*4+3] = d[3];
2000                 }
2001                 break;
2002         case DPSOFTRAST_BLENDMODE_ALPHA:
2003                 for (x = startx;x < endx;x++)
2004                 {
2005                         if (!pixelmask[x])
2006                                 continue;
2007                         a = in4f[x*4+3] * 255.0f;
2008                         b = 1.0f - in4f[x*4+3];
2009                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2010                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2011                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2012                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2013                         pixel[x*4+0] = d[0];
2014                         pixel[x*4+1] = d[1];
2015                         pixel[x*4+2] = d[2];
2016                         pixel[x*4+3] = d[3];
2017                 }
2018                 break;
2019         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2020                 for (x = startx;x < endx;x++)
2021                 {
2022                         if (!pixelmask[x])
2023                                 continue;
2024                         a = in4f[x*4+3] * 255.0f;
2025                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2026                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2027                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2028                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2029                         pixel[x*4+0] = d[0];
2030                         pixel[x*4+1] = d[1];
2031                         pixel[x*4+2] = d[2];
2032                         pixel[x*4+3] = d[3];
2033                 }
2034                 break;
2035         case DPSOFTRAST_BLENDMODE_ADD:
2036                 for (x = startx;x < endx;x++)
2037                 {
2038                         if (!pixelmask[x])
2039                                 continue;
2040                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2041                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2042                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2043                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2044                         pixel[x*4+0] = d[0];
2045                         pixel[x*4+1] = d[1];
2046                         pixel[x*4+2] = d[2];
2047                         pixel[x*4+3] = d[3];
2048                 }
2049                 break;
2050         case DPSOFTRAST_BLENDMODE_INVMOD:
2051                 for (x = startx;x < endx;x++)
2052                 {
2053                         if (!pixelmask[x])
2054                                 continue;
2055                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2056                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2057                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2058                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2059                         pixel[x*4+0] = d[0];
2060                         pixel[x*4+1] = d[1];
2061                         pixel[x*4+2] = d[2];
2062                         pixel[x*4+3] = d[3];
2063                 }
2064                 break;
2065         case DPSOFTRAST_BLENDMODE_MUL:
2066                 for (x = startx;x < endx;x++)
2067                 {
2068                         if (!pixelmask[x])
2069                                 continue;
2070                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2071                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2072                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2073                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2074                         pixel[x*4+0] = d[0];
2075                         pixel[x*4+1] = d[1];
2076                         pixel[x*4+2] = d[2];
2077                         pixel[x*4+3] = d[3];
2078                 }
2079                 break;
2080         case DPSOFTRAST_BLENDMODE_MUL2:
2081                 for (x = startx;x < endx;x++)
2082                 {
2083                         if (!pixelmask[x])
2084                                 continue;
2085                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2086                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2087                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2088                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2089                         pixel[x*4+0] = d[0];
2090                         pixel[x*4+1] = d[1];
2091                         pixel[x*4+2] = d[2];
2092                         pixel[x*4+3] = d[3];
2093                 }
2094                 break;
2095         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2096                 for (x = startx;x < endx;x++)
2097                 {
2098                         if (!pixelmask[x])
2099                                 continue;
2100                         a = in4f[x*4+3] * -255.0f;
2101                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2102                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2103                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2104                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2105                         pixel[x*4+0] = d[0];
2106                         pixel[x*4+1] = d[1];
2107                         pixel[x*4+2] = d[2];
2108                         pixel[x*4+3] = d[3];
2109                 }
2110                 break;
2111         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2112                 for (x = startx;x < endx;x++)
2113                 {
2114                         if (!pixelmask[x])
2115                                 continue;
2116                         a = 255.0f;
2117                         b = 1.0f - in4f[x*4+3];
2118                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2119                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2120                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2121                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2122                         pixel[x*4+0] = d[0];
2123                         pixel[x*4+1] = d[1];
2124                         pixel[x*4+2] = d[2];
2125                         pixel[x*4+3] = d[3];
2126                 }
2127                 break;
2128         case DPSOFTRAST_BLENDMODE_INVADD:
2129                 for (x = startx;x < endx;x++)
2130                 {
2131                         if (!pixelmask[x])
2132                                 continue;
2133                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2134                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2135                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2136                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2137                         pixel[x*4+0] = d[0];
2138                         pixel[x*4+1] = d[1];
2139                         pixel[x*4+2] = d[2];
2140                         pixel[x*4+3] = d[3];
2141                 }
2142                 break;
2143         }
2144 }
2145
2146 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2147 {
2148 #ifdef SSE2_PRESENT
2149         int x;
2150         int startx = span->startx;
2151         int endx = span->endx;
2152         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2153         unsigned char * RESTRICT pixelmask = span->pixelmask;
2154         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2155         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2156         if (!pixel)
2157                 return;
2158         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2159         pixeli += span->y * dpsoftrast.fb_width + span->x;
2160         // handle alphatest now (this affects depth writes too)
2161         if (thread->alphatest)
2162                 for (x = startx;x < endx;x++)
2163                         if (in4ub[x*4+3] < 0.5f)
2164                                 pixelmask[x] = false;
2165         // FIXME: this does not handle bigendian
2166         switch(thread->fb_blendmode)
2167         {
2168         case DPSOFTRAST_BLENDMODE_OPAQUE:
2169                 for (x = startx;x + 4 <= endx;)
2170                 {
2171                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2172                         {
2173                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2174                                 x += 4;
2175                         }
2176                         else
2177                         {
2178                                 if (pixelmask[x])
2179                                         pixeli[x] = ini[x];
2180                                 x++;
2181                         }
2182                 }
2183                 for (;x < endx;x++)
2184                         if (pixelmask[x])
2185                                 pixeli[x] = ini[x];
2186                 break;
2187         case DPSOFTRAST_BLENDMODE_ALPHA:
2188         #define FINISHBLEND(blend2, blend1) \
2189                 for (x = startx;x + 2 <= endx;x += 2) \
2190                 { \
2191                         __m128i src, dst; \
2192                         switch (*(const unsigned short*)&pixelmask[x]) \
2193                         { \
2194                         case 0x0101: \
2195                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2196                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2197                                 blend2; \
2198                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2199                                 continue; \
2200                         case 0x0100: \
2201                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2202                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2203                                 blend1; \
2204                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2205                                 continue; \
2206                         case 0x0001: \
2207                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2208                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2209                                 blend1; \
2210                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2211                                 continue; \
2212                         } \
2213                         break; \
2214                 } \
2215                 for(;x < endx; x++) \
2216                 { \
2217                         __m128i src, dst; \
2218                         if (!pixelmask[x]) \
2219                                 continue; \
2220                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2221                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2222                         blend1; \
2223                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2224                 }
2225
2226                 FINISHBLEND({
2227                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2228                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2229                 }, {
2230                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2231                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2232                 });
2233                 break;
2234         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2235                 FINISHBLEND({
2236                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2237                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2238                 }, {
2239                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2240                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2241                 });
2242                 break;
2243         case DPSOFTRAST_BLENDMODE_ADD:
2244                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2245                 break;
2246         case DPSOFTRAST_BLENDMODE_INVMOD:
2247                 FINISHBLEND({
2248                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2249                 }, {
2250                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2251                 });
2252                 break;
2253         case DPSOFTRAST_BLENDMODE_MUL:
2254                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2255                 break;
2256         case DPSOFTRAST_BLENDMODE_MUL2:
2257                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2258                 break;
2259         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2260                 FINISHBLEND({
2261                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2262                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2263                 }, {
2264                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2265                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2266                 });
2267                 break;
2268         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2269                 FINISHBLEND({
2270                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2271                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2272                 }, {
2273                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2274                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2275                 });
2276                 break;
2277         case DPSOFTRAST_BLENDMODE_INVADD:
2278                 FINISHBLEND({
2279                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2280                 }, {
2281                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2282                 });
2283         }
2284 #endif
2285 }
2286
2287 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2288 {
2289         int x;
2290         int startx = span->startx;
2291         int endx = span->endx;
2292         int flags;
2293         float c[4];
2294         float data[4];
2295         float slope[4];
2296         float tc[2], endtc[2];
2297         float tcscale[2];
2298         unsigned int tci[2];
2299         unsigned int tci1[2];
2300         unsigned int tcimin[2];
2301         unsigned int tcimax[2];
2302         int tciwrapmask[2];
2303         int tciwidth;
2304         int filter;
2305         int mip;
2306         const unsigned char * RESTRICT pixelbase;
2307         const unsigned char * RESTRICT pixel[4];
2308         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2309         // if no texture is bound, just fill it with white
2310         if (!texture)
2311         {
2312                 for (x = startx;x < endx;x++)
2313                 {
2314                         out4f[x*4+0] = 1.0f;
2315                         out4f[x*4+1] = 1.0f;
2316                         out4f[x*4+2] = 1.0f;
2317                         out4f[x*4+3] = 1.0f;
2318                 }
2319                 return;
2320         }
2321         mip = triangle->mip[texunitindex];
2322         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2323         // if this mipmap of the texture is 1 pixel, just fill it with that color
2324         if (texture->mipmap[mip][1] == 4)
2325         {
2326                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2327                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2328                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2329                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2330                 for (x = startx;x < endx;x++)
2331                 {
2332                         out4f[x*4+0] = c[0];
2333                         out4f[x*4+1] = c[1];
2334                         out4f[x*4+2] = c[2];
2335                         out4f[x*4+3] = c[3];
2336                 }
2337                 return;
2338         }
2339         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2340         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2341         flags = texture->flags;
2342         tcscale[0] = texture->mipmap[mip][2];
2343         tcscale[1] = texture->mipmap[mip][3];
2344         tciwidth = texture->mipmap[mip][2];
2345         tcimin[0] = 0;
2346         tcimin[1] = 0;
2347         tcimax[0] = texture->mipmap[mip][2]-1;
2348         tcimax[1] = texture->mipmap[mip][3]-1;
2349         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2350         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2351         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2352         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2353         for (x = startx;x < endx;)
2354         {
2355                 unsigned int subtc[2];
2356                 unsigned int substep[2];
2357                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2358                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2359                 if (nextsub >= endx)
2360                 {
2361                         nextsub = endsub = endx-1;      
2362                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2363                 }
2364                 tc[0] = endtc[0];
2365                 tc[1] = endtc[1];
2366                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2367                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2368                 substep[0] = (endtc[0] - tc[0]) * subscale;
2369                 substep[1] = (endtc[1] - tc[1]) * subscale;
2370                 subtc[0] = tc[0] * (1<<16);
2371                 subtc[1] = tc[1] * (1<<16);
2372                 if (filter)
2373                 {
2374                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2375                         {
2376                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2377                                 {
2378                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2379                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2380                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2381                                         tci[0] = subtc[0]>>16;
2382                                         tci[1] = subtc[1]>>16;
2383                                         tci1[0] = tci[0] + 1;
2384                                         tci1[1] = tci[1] + 1;
2385                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2386                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2387                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2388                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2389                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2390                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2391                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2392                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2393                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2394                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2395                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2396                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2397                                         out4f[x*4+0] = c[0];
2398                                         out4f[x*4+1] = c[1];
2399                                         out4f[x*4+2] = c[2];
2400                                         out4f[x*4+3] = c[3];
2401                                 }
2402                         }
2403                         else
2404                         {
2405                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2406                                 {
2407                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2408                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2409                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2410                                         tci[0] = subtc[0]>>16;
2411                                         tci[1] = subtc[1]>>16;
2412                                         tci1[0] = tci[0] + 1;
2413                                         tci1[1] = tci[1] + 1;
2414                                         tci[0] &= tciwrapmask[0];
2415                                         tci[1] &= tciwrapmask[1];
2416                                         tci1[0] &= tciwrapmask[0];
2417                                         tci1[1] &= tciwrapmask[1];
2418                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2419                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2420                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2421                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2422                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2423                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2424                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2425                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2426                                         out4f[x*4+0] = c[0];
2427                                         out4f[x*4+1] = c[1];
2428                                         out4f[x*4+2] = c[2];
2429                                         out4f[x*4+3] = c[3];
2430                                 }
2431                         }
2432                 }
2433                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2434                 {
2435                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2436                         {
2437                                 tci[0] = subtc[0]>>16;
2438                                 tci[1] = subtc[1]>>16;
2439                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2440                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2441                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2442                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2443                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2444                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2445                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2446                                 out4f[x*4+0] = c[0];
2447                                 out4f[x*4+1] = c[1];
2448                                 out4f[x*4+2] = c[2];
2449                                 out4f[x*4+3] = c[3];
2450                         }
2451                 }
2452                 else
2453                 {
2454                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2455                         {
2456                                 tci[0] = subtc[0]>>16;
2457                                 tci[1] = subtc[1]>>16;
2458                                 tci[0] &= tciwrapmask[0];
2459                                 tci[1] &= tciwrapmask[1];
2460                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2461                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2462                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2463                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2464                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2465                                 out4f[x*4+0] = c[0];
2466                                 out4f[x*4+1] = c[1];
2467                                 out4f[x*4+2] = c[2];
2468                                 out4f[x*4+3] = c[3];
2469                         }
2470                 }
2471         }
2472 }
2473
2474 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2475 {
2476 #ifdef SSE2_PRESENT
2477         int x;
2478         int startx = span->startx;
2479         int endx = span->endx;
2480         int flags;
2481         __m128 data, slope, tcscale;
2482         __m128i tcsize, tcmask, tcoffset, tcmax;
2483         __m128 tc, endtc;
2484         __m128i subtc, substep, endsubtc;
2485         int filter;
2486         int mip;
2487         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2488         const unsigned char * RESTRICT pixelbase;
2489         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2490         // if no texture is bound, just fill it with white
2491         if (!texture)
2492         {
2493                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2494                 return;
2495         }
2496         mip = triangle->mip[texunitindex];
2497         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2498         // if this mipmap of the texture is 1 pixel, just fill it with that color
2499         if (texture->mipmap[mip][1] == 4)
2500         {
2501                 unsigned int k = *((const unsigned int *)pixelbase);
2502                 for (x = startx;x < endx;x++)
2503                         outi[x] = k;
2504                 return;
2505         }
2506         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2507         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2508         flags = texture->flags;
2509         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2510         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2511         tcscale = _mm_cvtepi32_ps(tcsize);
2512         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2513         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2514         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2515         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2516         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2517         tcmax = _mm_packs_epi32(tcmask, tcmask);
2518         for (x = startx;x < endx;)
2519         {
2520                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2521                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2522                 if (nextsub >= endx)
2523                 {
2524                         nextsub = endsub = endx-1;
2525                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2526                 }       
2527                 tc = endtc;
2528                 subtc = endsubtc;
2529                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2530                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2531                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2532                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2533                 substep = _mm_slli_epi32(substep, 1);
2534                 if (filter)
2535                 {
2536                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2537                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2538                         {
2539                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2540                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2541                                 {
2542                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2543                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2544                                         tci = _mm_madd_epi16(tci, tcoffset);
2545                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2546                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2547                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2548                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2549                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2550                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2551                                         fracm = _mm_srli_epi16(subtc, 1);
2552                                         pix1 = _mm_add_epi16(pix1,
2553                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2554                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2555                                         pix3 = _mm_add_epi16(pix3,
2556                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2557                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2558                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2559                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2560                                         pix2 = _mm_add_epi16(pix2,
2561                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2562                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2563                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2564                                 }
2565                                 if (x <= endsub)
2566                                 {
2567                                         const unsigned char * RESTRICT ptr1;
2568                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2569                                         tci = _mm_madd_epi16(tci, tcoffset);
2570                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2571                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2572                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2573                                         fracm = _mm_srli_epi16(subtc, 1);
2574                                         pix1 = _mm_add_epi16(pix1,
2575                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2576                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2577                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2578                                         pix1 = _mm_add_epi16(pix1,
2579                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2580                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2581                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2582                                         x++;
2583                                 }
2584                         }
2585                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2586                         {
2587                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2588                                 {
2589                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2590                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2591                                         tci = _mm_madd_epi16(tci, tcoffset);
2592                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2593                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2594                                                                                         _mm_setzero_si128());
2595                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2596                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2597                                                                                         _mm_setzero_si128());
2598                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2599                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2600                                         tci = _mm_madd_epi16(tci, tcoffset);
2601                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2602                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2603                                                                                         _mm_setzero_si128());
2604                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2605                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2606                                                                                         _mm_setzero_si128());
2607                                         fracm = _mm_srli_epi16(subtc, 1);
2608                                         pix1 = _mm_add_epi16(pix1,
2609                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2610                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2611                                         pix3 = _mm_add_epi16(pix3,
2612                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2613                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2614                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2615                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2616                                         pix2 = _mm_add_epi16(pix2,
2617                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2618                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2619                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2620                                 }
2621                                 if (x <= endsub)
2622                                 {
2623                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2624                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2625                                         tci = _mm_madd_epi16(tci, tcoffset);
2626                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2627                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2628                                                                                         _mm_setzero_si128());
2629                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2630                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2631                                                                                         _mm_setzero_si128());
2632                                         fracm = _mm_srli_epi16(subtc, 1);
2633                                         pix1 = _mm_add_epi16(pix1,
2634                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2635                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2636                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2637                                         pix1 = _mm_add_epi16(pix1,
2638                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2639                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2640                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2641                                         x++;
2642                                 }
2643                         }
2644                         else
2645                         {
2646                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2647                                 {
2648                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2649                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2650                                         tci = _mm_madd_epi16(tci, tcoffset);
2651                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2652                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2653                                                                                         _mm_setzero_si128());
2654                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2655                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2656                                                                                         _mm_setzero_si128());
2657                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2658                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2659                                         tci = _mm_madd_epi16(tci, tcoffset);
2660                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2661                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2662                                                                                         _mm_setzero_si128());
2663                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2664                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2665                                                                                         _mm_setzero_si128());
2666                                         fracm = _mm_srli_epi16(subtc, 1);
2667                                         pix1 = _mm_add_epi16(pix1,
2668                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2669                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2670                                         pix3 = _mm_add_epi16(pix3,
2671                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2672                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2673                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2674                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2675                                         pix2 = _mm_add_epi16(pix2,
2676                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2677                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2678                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2679                                 }
2680                                 if (x <= endsub)
2681                                 {
2682                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2683                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2684                                         tci = _mm_madd_epi16(tci, tcoffset);
2685                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2686                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2687                                                                                         _mm_setzero_si128());
2688                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2689                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2690                                                                                         _mm_setzero_si128());
2691                                         fracm = _mm_srli_epi16(subtc, 1);
2692                                         pix1 = _mm_add_epi16(pix1,
2693                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2694                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2695                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2696                                         pix1 = _mm_add_epi16(pix1,
2697                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2698                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2699                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2700                                         x++;
2701                                 }
2702                         }
2703                 }
2704                 else
2705                 {
2706                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2707                         {
2708                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2709                                 {
2710                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2711                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2712                                         tci = _mm_madd_epi16(tci, tcoffset);
2713                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2714                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2715                                 }
2716                                 if (x <= endsub)
2717                                 {
2718                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2719                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2720                                         tci = _mm_madd_epi16(tci, tcoffset);
2721                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2722                                         x++;
2723                                 }
2724                         }
2725                         else
2726                         {
2727                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2728                                 {
2729                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2730                                         tci = _mm_and_si128(tci, tcmax); 
2731                                         tci = _mm_madd_epi16(tci, tcoffset);
2732                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2733                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2734                                 }
2735                                 if (x <= endsub)
2736                                 {
2737                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2738                                         tci = _mm_and_si128(tci, tcmax); 
2739                                         tci = _mm_madd_epi16(tci, tcoffset);
2740                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2741                                         x++;
2742                                 }
2743                         }
2744                 }
2745         }
2746 #endif
2747 }
2748
2749 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2750 {
2751         // TODO: IMPLEMENT
2752         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2753 }
2754
// Shadowmap lookup stub: real sampling is not implemented yet, so the input
// vector is never read and every query yields 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float result = 1.0f;
        (void)vector;
        return result;
}
2760
2761 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2762 {
2763         int x;
2764         int startx = span->startx;
2765         int endx = span->endx;
2766         float c[4];
2767         float data[4];
2768         float slope[4];
2769         float z;
2770         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2771         for (x = startx;x < endx;x++)
2772         {
2773                 z = zf[x];
2774                 c[0] = (data[0] + slope[0]*x) * z;
2775                 c[1] = (data[1] + slope[1]*x) * z;
2776                 c[2] = (data[2] + slope[2]*x) * z;
2777                 c[3] = (data[3] + slope[3]*x) * z;
2778                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2779                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2780                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2781                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2782         }
2783 }
2784
2785 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2786 {
2787         int x;
2788         int startx = span->startx;
2789         int endx = span->endx;
2790         float c[4];
2791         float data[4];
2792         float slope[4];
2793         float z;
2794         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2795         for (x = startx;x < endx;x++)
2796         {
2797                 z = zf[x];
2798                 c[0] = (data[0] + slope[0]*x) * z;
2799                 c[1] = (data[1] + slope[1]*x) * z;
2800                 c[2] = (data[2] + slope[2]*x) * z;
2801                 c[3] = (data[3] + slope[3]*x) * z;
2802                 out4f[x*4+0] = c[0];
2803                 out4f[x*4+1] = c[1];
2804                 out4f[x*4+2] = c[2];
2805                 out4f[x*4+3] = c[3];
2806         }
2807 }
2808
2809 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2810 {
2811         int x, startx = span->startx, endx = span->endx;
2812         float c[4], localcolor[4];
2813         localcolor[0] = subcolor[0];
2814         localcolor[1] = subcolor[1];
2815         localcolor[2] = subcolor[2];
2816         localcolor[3] = subcolor[3];
2817         for (x = startx;x < endx;x++)
2818         {
2819                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2820                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2821                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2822                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2823                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2824                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2825                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2826                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2827         }
2828 }
2829
2830 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2831 {
2832         int x, startx = span->startx, endx = span->endx;
2833         for (x = startx;x < endx;x++)
2834         {
2835                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2836                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2837                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2838                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2839         }
2840 }
2841
2842 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2843 {
2844         int x, startx = span->startx, endx = span->endx;
2845         for (x = startx;x < endx;x++)
2846         {
2847                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2848                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2849                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2850                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2851         }
2852 }
2853
2854 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2855 {
2856         int x, startx = span->startx, endx = span->endx;
2857         float a, b;
2858         for (x = startx;x < endx;x++)
2859         {
2860                 a = 1.0f - inb4f[x*4+3];
2861                 b = inb4f[x*4+3];
2862                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2863                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2864                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2865                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2866         }
2867 }
2868
2869 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2870 {
2871         int x, startx = span->startx, endx = span->endx;
2872         float localcolor[4], ilerp, lerp;
2873         localcolor[0] = color[0];
2874         localcolor[1] = color[1];
2875         localcolor[2] = color[2];
2876         localcolor[3] = color[3];
2877         ilerp = 1.0f - localcolor[3];
2878         lerp = localcolor[3];
2879         for (x = startx;x < endx;x++)
2880         {
2881                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2882                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2883                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2884                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2885         }
2886 }
2887
2888
2889
2890 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2891 {
2892 #ifdef SSE2_PRESENT
2893         int x;
2894         int startx = span->startx;
2895         int endx = span->endx;
2896         __m128 data, slope;
2897         __m128 mod, endmod;
2898         __m128i submod, substep, endsubmod;
2899         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2900         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2901         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2902         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2903         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2904         for (x = startx; x < endx;)
2905         {
2906                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2907                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2908                 if (nextsub >= endx)
2909                 {
2910                         nextsub = endsub = endx-1;
2911                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2912                 }
2913                 mod = endmod;
2914                 submod = endsubmod;
2915                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2916                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2917                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2918                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2919                 substep = _mm_packs_epi32(substep, substep);
2920                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2921                 {
2922                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2923                         pix = _mm_mulhi_epu16(pix, submod);
2924                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2925                 }
2926                 if (x <= endsub)
2927                 {
2928                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2929                         pix = _mm_mulhi_epu16(pix, submod);
2930                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2931                         x++;
2932                 }
2933         }
2934 #endif
2935 }
2936
2937 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2938 {
             // Writes one 4-byte pixel to out4ub[x*4] for every x in [span->startx, span->endx),
             // taken from the interpolated vertex attribute `arrayindex`.
             // The attribute is evaluated exactly (value * zf[x]) only at subspan boundaries;
             // the pixels in between are produced by stepping a 16-bit fixed-point value
             // (1.0 == 4095) linearly, then shifting right by 4 to reach the 0..255 range.
             // zf is indexed by x; presumably it is the per-pixel factor filled in by
             // DPSOFTRAST_Draw_Span_Begin -- confirm against the caller.
             // Without SSE2_PRESENT the body is compiled out and nothing is written.
2939 #ifdef SSE2_PRESENT
2940         int x;
2941         int startx = span->startx;
2942         int endx = span->endx;
2943         __m128 data, slope;
2944         __m128 mod, endmod;
2945         __m128i submod, substep, endsubmod;
2946         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
             // swap components 0 and 2, leaving 1 and 3 in place
             // (RGBA -> BGRA, assuming the attribute is stored as RGBA)
2947         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2948         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
             // exact value at the first pixel; becomes the start of the first subspan
2949         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2950         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2951         for (x = startx; x < endx;)
2952         {
2953                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2954                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2955                 if (nextsub >= endx)
2956                 {
                             // last subspan: end exactly on the final pixel and rescale the
                             // per-pixel step to the shorter length (skipped for a single pixel)
2957                         nextsub = endsub = endx-1;
2958                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2959                 }
2960                 mod = endmod;
2961                 submod = endsubmod;
2962                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
                     // per-pixel fixed-point delta across this subspan
2963                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2964                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
                     // low four lanes = value at pixel x, high four lanes = value at pixel x+1
2965                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
                     // The loop below emits two pixels per iteration, so both halves must
                     // advance by two per-pixel steps.  (Adding a single step here made the
                     // gradient run at half slope and jump at every subspan boundary.)
                     // NOTE(review): the function directly above this one uses the same
                     // two-pixel loop and should be checked for the same issue.
2966                 substep = _mm_slli_epi16(_mm_packs_epi32(substep, substep), 1);
2967                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2968                 {
2969                         __m128i pix = _mm_srai_epi16(submod, 4);
2970                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2971                 }
                     // odd pixel left at the end of the subspan: the low lanes hold its value
2972                 if (x <= endsub)
2973                 {
2974                         __m128i pix = _mm_srai_epi16(submod, 4);
2975                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2976                         x++;
2977                 }
2978         }
2979 #endif
2980 }
2981
2982 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
2983 {
             // For every pixel x in [span->startx, span->endx):
             //   out = clamp(ina + inb - subcolor*255, 0, 255) per 8-bit channel.
             // subcolor points at four floats; components 0 and 2 are swapped before use
             // (presumably RGBA -> BGRA to match the byte buffers -- confirm).
             // `triangle` is not used.  Without SSE2_PRESENT the body is compiled out.
2984 #ifdef SSE2_PRESENT
2985         int x, startx = span->startx, endx = span->endx;
2986         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
             // pack to 16-bit lanes and replicate the four channels into both 64-bit halves
             // so one register covers two pixels
2987         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
             // main loop: two pixels per iteration, bytes zero-extended to 16-bit lanes
2988         for (x = startx;x+2 <= endx;x+=2)
2989         {
2990                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
2991                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
2992                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
                     // _mm_packus_epi16 clamps each lane to 0..255 while narrowing
2993                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
2994         }
             // one trailing pixel when the span length is odd
2995         if (x < endx)
2996         {
2997                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
2998                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
2999                 pix1 = _mm_add_epi16(pix1, _mm_sub_epi16(pix2, localcolor));
3000                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3001         }
3002 #endif
3003 }
3004
3005 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3006 {
             // For every pixel x in [span->startx, span->endx):
             //   out = (ina * inb) >> 8 per 8-bit channel.
             // ina is zero-extended into 16-bit lanes, while inb is placed in the HIGH byte
             // of each lane (value << 8); the unsigned high-half multiply then yields
             // (a * b * 256) >> 16.
             // `triangle` is not used.  Without SSE2_PRESENT the body is compiled out.
3007 #ifdef SSE2_PRESENT
3008         int x, startx = span->startx, endx = span->endx;
             // main loop: two pixels per iteration
3009         for (x = startx;x+2 <= endx;x+=2)
3010         {
3011                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3012                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3013                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3014                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3015         }
             // one trailing pixel when the span length is odd
3016         if (x < endx)
3017         {
3018                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3019                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3020                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3021                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3022         }
3023 #endif
3024 }
3025
3026 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3027 {
             // For every pixel x in [span->startx, span->endx):
             //   out = min(ina + inb, 255) per 8-bit channel.
             // Both inputs are zero-extended to 16-bit lanes, added, and narrowed with
             // _mm_packus_epi16, which clamps to 0..255.
             // `triangle` is not used.  Without SSE2_PRESENT the body is compiled out.
3028 #ifdef SSE2_PRESENT
3029         int x, startx = span->startx, endx = span->endx;
             // main loop: two pixels per iteration
3030         for (x = startx;x+2 <= endx;x+=2)
3031         {
3032                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3033                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3034                 pix1 = _mm_add_epi16(pix1, pix2);
3035                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3036         }
             // one trailing pixel when the span length is odd
3037         if (x < endx)
3038         {
3039                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3040                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3041                 pix1 = _mm_add_epi16(pix1, pix2);
3042                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3043         }
3044 #endif
3045 }
3046
3047 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3048 {
             // For every pixel x in [span->startx, span->endx):
             //   out = clamp(ina + ((inb * tint) >> 8), 0, 255) per 8-bit channel,
             // where tint = inbtintbgra[channel] * 256 converted to a 16-bit integer.
             // The four tint floats are used in the order given (no component swap here).
             // `triangle` is not used.  Without SSE2_PRESENT the body is compiled out.
3049 #ifdef SSE2_PRESENT
3050         int x, startx = span->startx, endx = span->endx;
3051         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
             // pack to 16-bit lanes and replicate into both 64-bit halves (one per pixel)
3052         tint = _mm_shuffle_epi32(_mm_packs_epi32(tint, tint), _MM_SHUFFLE(1, 0, 1, 0));
             // main loop: two pixels per iteration
3053         for (x = startx;x+2 <= endx;x+=2)
3054         {
3055                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
                     // inb goes into the high byte of each lane so mulhi returns (tint * b * 256) >> 16
3056                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3057                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3058                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3059         }
             // one trailing pixel when the span length is odd
3060         if (x < endx)
3061         {
3062                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3063                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3064                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3065                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3066         }
3067 #endif
3068 }
3069
3070 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3071 {
             // For every pixel x in [span->startx, span->endx):
             //   out = ina + (((inb - ina) * a) >> 8) per 8-bit channel,
             // where a is the fourth byte of the inb pixel, i.e. a linear blend from ina
             // toward inb weighted by inb's own last channel.
             // `triangle` is not used.  Without SSE2_PRESENT the body is compiled out.
3072 #ifdef SSE2_PRESENT
3073         int x, startx = span->startx, endx = span->endx;
             // main loop: two pixels per iteration
3074         for (x = startx;x+2 <= endx;x+=2)
3075         {
3076                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3077                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
                     // broadcast lane 3 of each pixel across that pixel's four lanes
3078                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
                     // both factors are shifted left by 4 so the signed high-half multiply
                     // (an implicit >> 16) nets a >> 8
3079                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3080                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3081         }
             // one trailing pixel when the span length is odd (only the low half is live)
3082         if (x < endx)
3083         {
3084                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3085                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3086                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3087                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3088                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3089         }
3090 #endif
3091 }
3092
3093 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3094 {
             // For every pixel x in [span->startx, span->endx):
             //   out = in + (((c - in) * a) >> 8) per 8-bit channel,
             // where c = color * 255 (components 0 and 2 swapped, presumably RGBA -> BGRA
             // -- confirm) and a is the fourth component of that converted color.
             // `triangle` is not used.  Without SSE2_PRESENT the body is compiled out.
3095 #ifdef SSE2_PRESENT
3096         int x, startx = span->startx, endx = span->endx;
3097         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
             // pack to 16-bit lanes and replicate into both 64-bit halves (one per pixel)
3098         localcolor = _mm_shuffle_epi32(_mm_packs_epi32(localcolor, localcolor), _MM_SHUFFLE(1, 0, 1, 0));
             // blend weight = lane 3 broadcast to all lanes, pre-shifted left by 4; together
             // with the << 4 on the difference below, the signed mulhi (>> 16) nets a >> 8
3099         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
             // main loop: two pixels per iteration
3100         for (x = startx;x+2 <= endx;x+=2)
3101         {
3102                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3103                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3104                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3105         }
             // one trailing pixel when the span length is odd
3106         if (x < endx)
3107         {
3108                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3109                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3110                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3111         }
3112 #endif
3113 }
3114
3115
3116
3117 void DPSOFTRAST_VertexShader_Generic(void)
3118 {
             // Vertex stage of the generic shader.
             // Positions are transformed and projected with the ModelViewProjection matrix
             // uniform; COLOR and TEXCOORD0 go through DPSOFTRAST_Array_Load with the same
             // array index for both arguments.  TEXCOORD1 is only loaded when the global
             // dpsoftrast.shader_permutation has SHADERPERMUTATION_SPECULAR set (the pixel
             // stage samples a second texture only in that permutation).
3119         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3120         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3121         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3122         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3123                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3124 }
3125
3126 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3127 {
             // Pixel stage of the generic shader for one span.
             //   DIFFUSE set:   FragColor = first texture * varying attribute 1; if SPECULAR
             //                  is also set, the second texture is combined with it:
             //                    COLORMAPPING       -> multiply
             //                    GLOW               -> add (clamped)
             //                    VERTEXTEXTUREBLEND -> blend by the second texture's alpha
             //   DIFFUSE clear: FragColor = varying attribute 1 only.
             // The result is handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
             // The literal array indices 1 and 2 presumably correspond to
             // DPSOFTRAST_ARRAY_COLOR and DPSOFTRAST_ARRAY_TEXCOORD0 -- confirm.
3128         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3129         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3130         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3131         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3132         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3133         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3134         {
3135                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3136                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3137                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3138                 {
3139                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3140                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3141                         {
3142                                 // multiply
3143                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3144                         }
                             // This branch used to test SHADERPERMUTATION_COLORMAPPING a second
                             // time, which made it unreachable.  GLOW is the permutation that is
                             // believed to select the additive combine in the engine's GLSL
                             // generic shader -- confirm against the shader source.
3145                         else if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3146                         {
3147                                 // add
3148                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3149                         }
3150                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3151                         {
3152                                 // alphablend
3153                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3154                         }
3155                 }
3156         }
3157         else
3158                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3159         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3160 }
3161
3162
3163
3164 void DPSOFTRAST_VertexShader_PostProcess(void)
3165 {
             // Vertex stage of the post-process shader: transform and project positions
             // with the ModelViewProjection matrix uniform, then run DPSOFTRAST_Array_Load
             // on TEXCOORD0 and TEXCOORD1 (same array index for both arguments).
3166         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3167         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3168         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3169 }
3170
3171 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3172 {
             // Pixel stage of the post-process shader for one span:
             //   1. sample the first texture (array index 2) straight into FragColor
             //   2. BLOOM permutation: sample the second texture (array index 3) and combine
             //      it via DPSOFTRAST_Draw_Span_AddBloomBGRA8 with the BloomColorSubtract uniform
             //   3. mix toward the ViewTintColor uniform (always applied)
             //   4. SATURATION and GAMMARAMPS permutations are recognised but do nothing yet
             //   5. hand the result to DPSOFTRAST_Draw_Span_FinishBGRA8
             // The literal array indices 2 and 3 presumably correspond to
             // DPSOFTRAST_ARRAY_TEXCOORD0 and DPSOFTRAST_ARRAY_TEXCOORD1 -- confirm.
3173         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3174         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3175         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3176         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3177         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3178         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3179         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3180         {
3181                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3182                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3183         }
             // in-place: FragColor is both the input and the output buffer
3184         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3185         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3186         {
3187                 // TODO: implement saturation
3188         }
3189         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3190         {
3191                 // TODO: implement gammaramps
3192         }
3193         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3194 }
3195
3196
3197
3198 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3199 {
             // Vertex stage of the depth/shadow shader: only positions are processed,
             // transformed and projected with the ModelViewProjection matrix uniform.
3200         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3201 }
3202
3203 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3204 {
             // Pixel stage of the depth/shadow shader: zeroes the four bytes of every pixel
             // in [span->startx, span->endx) and passes the buffer to
             // DPSOFTRAST_Draw_Span_FinishBGRA8.
3205         // this is never called (because colormask is off when this shader is used)
3206         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3207         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3208         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3209         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3210         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3211 }
3212
3213
3214
3215 void DPSOFTRAST_VertexShader_FlatColor(void)
3216 {
             // Vertex stage of the flat-color shader: transform and project positions with
             // the ModelViewProjection matrix uniform, and transform TEXCOORD0 in place with
             // the TexMatrix uniform.
3217         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3218         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3219 }
3220
3221 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3222 {
             // Pixel stage of the flat-color shader for one span:
             //   FragColor = (color texture * Color_Ambient) >> 8 for the first three bytes,
             //   fourth byte = (texture byte * Alpha uniform) >> 8.
             // The uniforms are converted to integers scaled by 256 (256 == 1.0).
             // NOTE(review): the product is truncated to unsigned char without clamping,
             // so a uniform above 1.0 would wrap around rather than saturate the way the
             // SSE-based shaders in this file do -- confirm the uniform range.
3223         int x, startx = span->startx, endx = span->endx;
3224         int Color_Ambienti[4];
3225         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3226         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3227         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
             // uniform components 0 and 2 are stored swapped so the integer array lines up
             // with the byte order of the texture buffer
3228         Color_Ambienti[2] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0]*256.0f);
3229         Color_Ambienti[1] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1]*256.0f);
3230         Color_Ambienti[0] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2]*256.0f);
3231         Color_Ambienti[3] = (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]        *256.0f);
3232         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3233         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3234         for (x = startx;x < endx;x++)
3235         {
3236                 buffer_FragColorbgra8[x*4+0] = (buffer_texture_colorbgra8[x*4+0] * Color_Ambienti[0])>>8;
3237                 buffer_FragColorbgra8[x*4+1] = (buffer_texture_colorbgra8[x*4+1] * Color_Ambienti[1])>>8;
3238                 buffer_FragColorbgra8[x*4+2] = (buffer_texture_colorbgra8[x*4+2] * Color_Ambienti[2])>>8;
3239                 buffer_FragColorbgra8[x*4+3] = (buffer_texture_colorbgra8[x*4+3] * Color_Ambienti[3])>>8;
3240         }
3241         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3242 }
3243
3244
3245
3246 void DPSOFTRAST_VertexShader_VertexColor(void)
3247 {
             // Vertex stage of the vertex-color shader: transform and project positions with
             // the ModelViewProjection matrix uniform, run DPSOFTRAST_Array_Load on COLOR,
             // and transform TEXCOORD0 in place with the TexMatrix uniform.
3248         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3249         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3250         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3251 }
3252
3253 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3254 {
             // Pixel stage of the vertex-color shader for one span:
             //   pixel = texture * (Color_Ambient + Color_Diffuse * vertexcolor)
             // for the first three channels; the fourth channel uses the Alpha uniform in
             // place of Color_Ambient's fourth component and no diffuse term.
             // Output goes directly into the framebuffer row when there is no alpha test
             // and the blend mode is opaque; otherwise into a temporary buffer that is
             // passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
             // Pixels whose span->pixelmask byte is zero are skipped.
             // Without SSE2_PRESENT the body is compiled out.
3255 #ifdef SSE2_PRESENT
3256         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // framebuffer address of the span origin; it is indexed with x below
3257         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3258         int x, startx = span->startx, endx = span->endx;
3259         __m128i Color_Ambientm, Color_Diffusem;
3260         __m128 data, slope;
3261         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3262         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3263         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3264         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3265         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3266         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3267         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3268                 pixel = buffer_FragColorbgra8;
             // ambient: scaled by 256, components 0 and 2 swapped, fourth lane replaced by
             // the Alpha uniform scaled by 255, then packed to 16-bit lanes
3269         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3270         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3271         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3272         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse: scaled by 4096, components 0 and 2 swapped, fourth lane cleared
3273         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3274         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3275         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
             // vertex color attribute: same component swap, advanced to startx and scaled
             // by 4096 so that (diffuse*4096 * color*4096) >> 16 lands on the 256 == 1.0
             // scale used by the ambient term
3276         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3277         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3278         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3279         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3280         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3281         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
             // the for-increment advances data by one pixel on every pass, including the
             // `continue` paths below
3282         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3283         {
3284                 __m128i color, mod, pix;
                     // fast path: at least four pixels remain and all four mask bytes are 1
3285                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3286                 {
3287                         __m128i pix2, mod2;
3288                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3289                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
                             // data is stepped three times here; the loop increment supplies the fourth
3290                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3291                         data = _mm_add_ps(data, slope);
3292                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3293                         data = _mm_add_ps(data, slope);
3294                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3295                         data = _mm_add_ps(data, slope);
3296                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
                             // texture bytes sit in the high byte of each lane (value << 8)
3297                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3298                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3299                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3300                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3301                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3302                         x += 3;
3303                         continue;
3304                 }
3305                 if (!pixelmask[x])
3306                         continue;
                     // single-pixel path
3307                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3308                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3309                 mod = _mm_packs_epi32(mod, mod);
3310                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3311                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3312         }
             // only the temporary-buffer case needs the finishing pass
3313         if (pixel == buffer_FragColorbgra8)
3314                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3315 #endif
3316 }
3317
3318
3319
3320 void DPSOFTRAST_VertexShader_Lightmap(void)
3321 {
             // Vertex stage of the lightmap shader: transform and project positions with the
             // ModelViewProjection matrix uniform, transform TEXCOORD0 in place with the
             // TexMatrix uniform, and run DPSOFTRAST_Array_Load on TEXCOORD4 (the array the
             // pixel stage uses for the lightmap lookup).
3322         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3323         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3324         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3325 }
3326
3327 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3328 {
             // Pixel stage of the lightmap shader for one span:
             //   pixel = color * (Color_Ambient + Color_Diffuse * lightmap)
             //           [+ Color_Glow * glow   when the GLOW permutation is set]
             // for the first three channels; the fourth channel uses the Alpha uniform in
             // place of Color_Ambient's fourth component and has no diffuse or glow term.
             // Output goes directly into the framebuffer row when there is no alpha test
             // and the blend mode is opaque; otherwise into a temporary buffer that is
             // passed to DPSOFTRAST_Draw_Span_FinishBGRA8.
             // Pixels whose span->pixelmask byte is zero are skipped.
             // Without SSE2_PRESENT the body is compiled out.
3329 #ifdef SSE2_PRESENT
3330         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // framebuffer address of the span origin; it is indexed with x below
3331         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3332         int x, startx = span->startx, endx = span->endx;
3333         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3334         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3335         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3336         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3337         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3338         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3339         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3340         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3341         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3342         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3343                 pixel = buffer_FragColorbgra8;
             // ambient: scaled by 256, components 0 and 2 swapped, fourth lane replaced by
             // the Alpha uniform scaled by 255, then packed to 16-bit lanes
3344         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3345         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3346         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3347         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse: scaled by 256, components 0 and 2 swapped, fourth lane cleared
3348         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3349         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3350         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3351         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3352         {
3353                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
                     // glow color: scaled by 256, components 0 and 2 swapped, fourth lane cleared
3354                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3355                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3356                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
                     // low half = ambient, high half = glow color; used by the single-pixel path
3357                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3358                 for (x = startx;x < endx;x++)
3359                 {
3360                         __m128i color, lightmap, glow, pix;
                             // fast path: at least four pixels remain and all four mask bytes are 1
3361                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3362                         {
3363                                 __m128i pix2;
3364                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3365                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3366                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
                                     // texture bytes are unpacked into the high byte of each lane (value << 8);
                                     // pix covers pixels x and x+1, pix2 covers x+2 and x+3
3367                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3368                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3369                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3370                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3371                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3372                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3373                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3374                                 x += 3;
3375                                 continue;
3376                         }
3377                         if (!pixelmask[x])
3378                                 continue;
                             // single-pixel path: the lit color is computed in the low half and the
                             // glow product in the high half of one register, then the halves are added
3379                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3380                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3381                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3382                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3383                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3384                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3385                 }
3386         }
3387         else
3388         {
                     // same as above without the glow term
3389                 for (x = startx;x < endx;x++)
3390                 {
3391                         __m128i color, lightmap, pix;
3392                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3393                         {
3394                                 __m128i pix2;
3395                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3396                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3397                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3398                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3399                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3400                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3401                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3402                                 x += 3;
3403                                 continue;
3404                         }
3405                         if (!pixelmask[x]) 
3406                                 continue;
3407                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3408                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3409                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3410                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3411                 }
3412         }
             // only the temporary-buffer case needs the finishing pass
3413         if (pixel == buffer_FragColorbgra8)
3414                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3415 #endif
3416 }
3417
3418
3419
3420 void DPSOFTRAST_VertexShader_FakeLight(void)
3421 {
3422         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3423 }
3424
3425 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3426 {
3427         // TODO: IMPLEMENT
3428         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3429         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3430         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3431         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3432         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3433 }
3434
3435
3436
// Model-space light direction map vertex stage: no dedicated implementation,
// the work is forwarded unchanged to the Lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3441
3442 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3443 {
3444         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3445         // TODO: IMPLEMENT
3446 }
3447
3448
3449
// Tangent-space light direction map vertex stage: no dedicated implementation,
// the work is forwarded unchanged to the Lightmap vertex shader.
void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
{
	DPSOFTRAST_VertexShader_Lightmap();
}
3454
3455 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3456 {
3457         DPSOFTRAST_PixelShader_Lightmap(thread, triangle, span);
3458         // TODO: IMPLEMENT
3459 }
3460
3461
3462
3463 void DPSOFTRAST_VertexShader_LightDirection(void)
3464 {
3465         int i;
3466         int numvertices = dpsoftrast.numvertices;
3467         float LightDir[4];
3468         float LightVector[4];
3469         float EyePosition[4];
3470         float EyeVectorModelSpace[4];
3471         float EyeVector[4];
3472         float position[4];
3473         float svector[4];
3474         float tvector[4];
3475         float normal[4];
3476         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3477         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3478         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3479         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3480         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3481         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3482         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3483         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3484         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3485         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3486         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3487         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3488         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3489         for (i = 0;i < numvertices;i++)
3490         {
3491                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3492                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3493                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3494                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3495                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3496                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3497                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3498                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3499                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3500                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3501                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3502                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3503                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3504                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3505                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3506                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3507                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3508                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3509                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3510                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3511                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3512                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3513                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3514                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3515                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3516                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3517                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3518                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3519                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3520         }
3521         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3522 }
3523
// Small scalar / 3-component vector helpers.
// These are plain macros: arguments may be evaluated more than once, so do not
// pass expressions with side effects (x++, function calls that mutate state).
#define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
#define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
// dot product of the first three elements of a and b
#define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
#define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
#define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
// Scales the first three elements of v in place to unit length; a vector whose
// length evaluates to zero is left untouched.  The argument is parenthesized so
// pointer expressions (e.g. base + 4) work, and the temporary uses a
// macro-specific name so it cannot capture a caller variable called "len".
#define DPSOFTRAST_Vector3Normalize(v)\
do\
{\
	float dpsoftrast_normalize_len_ = sqrt(DPSOFTRAST_Vector3Dot((v),(v)));\
	if (dpsoftrast_normalize_len_)\
	{\
		dpsoftrast_normalize_len_ = 1.0f / dpsoftrast_normalize_len_;\
		(v)[0] *= dpsoftrast_normalize_len_;\
		(v)[1] *= dpsoftrast_normalize_len_;\
		(v)[2] *= dpsoftrast_normalize_len_;\
	}\
}\
while(0)
3542
3543 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3544 {
3545         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3546         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3547         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3548         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3549         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3550         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3551         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3552         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3553         int x, startx = span->startx, endx = span->endx;
3554         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3555         float LightVectordata[4];
3556         float LightVectorslope[4];
3557         float EyeVectordata[4];
3558         float EyeVectorslope[4];
3559         float z;
3560         float diffusetex[4];
3561         float glosstex[4];
3562         float surfacenormal[4];
3563         float lightnormal[4];
3564         float eyenormal[4];
3565         float specularnormal[4];
3566         float diffuse;
3567         float specular;
3568         float SpecularPower;
3569         int d[4];
3570         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3571         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3572         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3573         Color_Glow[3] = 0.0f;
3574         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3575         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3576         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3577         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3578         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3579         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3580         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3581         Color_Pants[3] = 0.0f;
3582         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3583         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3584         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3585         Color_Shirt[3] = 0.0f;
3586         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3587         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3588         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3589         {
3590                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3591                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3592         }
3593         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3594         {
3595                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3596         }
3597         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3598         {
3599                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3600                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3601                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3602                 Color_Diffuse[3] = 0.0f;
3603                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3604                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3605                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3606                 LightColor[3] = 0.0f;
3607                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3608                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3609                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3610                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3611                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3612                 Color_Specular[3] = 0.0f;
3613                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3614                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3615                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3616                 for (x = startx;x < endx;x++)
3617                 {
3618                         z = buffer_z[x];
3619                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3620                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3621                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3622                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3623                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3624                         {
3625                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3626                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3627                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3628                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3629                         }
3630                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3631                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3632                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3633                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3634                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3635                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3636                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3637                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3638
3639                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3640                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3641                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3642                         DPSOFTRAST_Vector3Normalize(lightnormal);
3643
3644                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3645                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3646                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3647                         DPSOFTRAST_Vector3Normalize(eyenormal);
3648
3649                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3650                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3651                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3652                         DPSOFTRAST_Vector3Normalize(specularnormal);
3653
3654                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3655                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3656                         specular = pow(specular, SpecularPower * glosstex[3]);
3657                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3658                         {
3659                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3660                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3661                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3662                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3663                         }
3664                         else
3665                         {
3666                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3667                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3668                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3669                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3670                         }
3671                         buffer_FragColorbgra8[x*4+0] = d[0];
3672                         buffer_FragColorbgra8[x*4+1] = d[1];
3673                         buffer_FragColorbgra8[x*4+2] = d[2];
3674                         buffer_FragColorbgra8[x*4+3] = d[3];
3675                 }
3676         }
3677         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3678         {
3679                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3680                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3681                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3682                 Color_Diffuse[3] = 0.0f;
3683                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3684                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3685                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3686                 LightColor[3] = 0.0f;
3687                 DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3688                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3689                 for (x = startx;x < endx;x++)
3690                 {
3691                         z = buffer_z[x];
3692                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3693                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3694                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3695                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3696                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3697                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3698                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3699                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3700
3701                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3702                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3703                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3704                         DPSOFTRAST_Vector3Normalize(lightnormal);
3705
3706                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3707                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3708                         {
3709                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3710                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3711                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3712                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3713                         }
3714                         else
3715                         {
3716                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3717                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3718                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3719                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3720                         }
3721                         buffer_FragColorbgra8[x*4+0] = d[0];
3722                         buffer_FragColorbgra8[x*4+1] = d[1];
3723                         buffer_FragColorbgra8[x*4+2] = d[2];
3724                         buffer_FragColorbgra8[x*4+3] = d[3];
3725                 }
3726         }
3727         else
3728         {
3729                 for (x = startx;x < endx;x++)
3730                 {
3731                         z = buffer_z[x];
3732                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3733                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3734                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3735                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3736
3737                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3738                         {
3739                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3740                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3741                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3742                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3743                         }
3744                         else
3745                         {
3746                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3747                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3748                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3749                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3750                         }
3751                         buffer_FragColorbgra8[x*4+0] = d[0];
3752                         buffer_FragColorbgra8[x*4+1] = d[1];
3753                         buffer_FragColorbgra8[x*4+2] = d[2];
3754                         buffer_FragColorbgra8[x*4+3] = d[3];
3755                 }
3756         }
3757         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3758 }
3759
3760
3761
3762 void DPSOFTRAST_VertexShader_LightSource(void)
3763 {
3764         int i;
3765         int numvertices = dpsoftrast.numvertices;
3766         float LightPosition[4];
3767         float LightVector[4];
3768         float LightVectorModelSpace[4];
3769         float EyePosition[4];
3770         float EyeVectorModelSpace[4];
3771         float EyeVector[4];
3772         float position[4];
3773         float svector[4];
3774         float tvector[4];
3775         float normal[4];
3776         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
3777         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
3778         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
3779         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
3780         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3781         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3782         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3783         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3784         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3785         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3786         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3787         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3788         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3789         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3790         for (i = 0;i < numvertices;i++)
3791         {
3792                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3793                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3794                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3795                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3796                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3797                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3798                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3799                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3800                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3801                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3802                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3803                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3804                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
3805                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
3806                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
3807                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
3808                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
3809                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
3810                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
3811                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
3812                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
3813                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
3814                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3815                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3816                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3817                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3818                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3819                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3820                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
3821                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
3822                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
3823                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
3824         }
3825         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3826         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
3827 }
3828
// Pixel (span) shader for the light-source shader mode; DPSOFTRAST_ShaderModeTable pairs it
// with DPSOFTRAST_VertexShader_LightSource.
// For every x in [span->startx, span->endx) it computes a BGRA8 color into
// buffer_FragColorbgra8 and then hands that buffer to DPSOFTRAST_Draw_Span_FinishBGRA8.
// thread->shader_permutation selects one of three loops: SPECULAR (ambient + diffuse +
// specular), otherwise DIFFUSE (ambient + diffuse), otherwise ambient only. The COLORMAPPING,
// CUBEFILTER and SHADOWMAP2D flags are tested inside each loop.
// The body only exists when SSE2_PRESENT is defined; without it the function is empty.
3829 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3830 {
3831 #ifdef SSE2_PRESENT
3832         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3833         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3834         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3835         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3836         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3837         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3838         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3839         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3840         int x, startx = span->startx, endx = span->endx;
3841         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3842         float CubeVectordata[4];
3843         float CubeVectorslope[4];
3844         float LightVectordata[4];
3845         float LightVectorslope[4];
3846         float EyeVectordata[4];
3847         float EyeVectorslope[4];
3848         float z;
3849         float diffusetex[4];
3850         float glosstex[4];
3851         float surfacenormal[4];
3852         float lightnormal[4];
3853         float eyenormal[4];
3854         float specularnormal[4];
3855         float diffuse;
3856         float specular;
3857         float SpecularPower;
3858         float CubeVector[4];
3859         float attenuation;
3860         int d[4];
             // uniform colors: components 0,1,2 of each uniform are stored reversed (indices 2,1,0),
             // which lines them up with the byte order of the *bgra8 buffers used below
             // NOTE(review): Color_Glow is filled in here but never read again in this function
3861         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3862         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3863         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3864         Color_Glow[3] = 0.0f;
3865         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3866         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3867         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
             // the fourth ambient component is taken from the separate Alpha uniform
3868         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3869         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3870         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3871         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3872         Color_Diffuse[3] = 0.0f;
3873         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3874         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3875         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3876         Color_Specular[3] = 0.0f;
3877         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3878         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3879         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3880         Color_Pants[3] = 0.0f;
3881         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3882         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3883         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3884         Color_Shirt[3] = 0.0f;
3885         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3886         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3887         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3888         LightColor[3] = 0.0f;
             // scaled by 1/255; the specular loop multiplies it by the raw gloss alpha byte (glosstex[3])
3889         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
             // data/slope pairs for TEXCOORD1/2/3; the loops below evaluate (data + slope*x) * buffer_z[x]
             // NOTE(review): DPSOFTRAST_CALCATTRIB4F is defined elsewhere; presumably it fills both arrays - confirm
3890         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3891         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3892         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3893         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3894         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
             // fill the per-span texture buffers; the pants/shirt, cube, normal and gloss buffers are
             // only filled under the same permutation flags that later guard every read of them
3895         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3896         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3897         {
3898                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3899                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3900         }
3901         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3902                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
             // variant 1: ambient + diffuse + specular
3903         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3904         {
3905                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3906                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3907                 for (x = startx;x < endx;x++)
3908                 {
3909                         z = buffer_z[x];
3910                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3911                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3912                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
                             // attenuation is 1 minus the squared length of the interpolated TEXCOORD3 vector;
                             // pixels below the 0.01 threshold are skipped and keep the zero from the memset above
3913                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3914                         if (attenuation < 0.01f)
3915                                 continue;
3916                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3917                         {
3918                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3919                                 if (attenuation < 0.01f)
3920                                         continue;
3921                         }
3922
                             // texel bytes are used as floats without rescaling (0..255 range)
3923                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3924                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3925                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3926                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3927                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3928                         {
                                     // Color_Pants[3] and Color_Shirt[3] are 0.0f, so the fourth component is left unchanged
3929                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3930                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3931                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3932                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3933                         }
3934                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3935                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3936                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3937                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
                             // unpack the normal texel as byte/128 - 1; byte 2 feeds component 0 and byte 0 feeds component 2
3938                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3939                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3940                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3941                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3942
3943                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3944                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3945                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3946                         DPSOFTRAST_Vector3Normalize(lightnormal);
3947
3948                         eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3949                         eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3950                         eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3951                         DPSOFTRAST_Vector3Normalize(eyenormal);
3952
                             // sum of the normalized light and eye vectors, renormalized
3953                         specularnormal[0] = lightnormal[0] + eyenormal[0];
3954                         specularnormal[1] = lightnormal[1] + eyenormal[1];
3955                         specularnormal[2] = lightnormal[2] + eyenormal[2];
3956                         DPSOFTRAST_Vector3Normalize(specularnormal);
3957
3958                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3959                         specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3960                         specular = pow(specular, SpecularPower * glosstex[3]);
                             // each channel is clamped at 255 only
                             // NOTE(review): a negative product would not be clamped before the store into the unsigned char buffer
3961                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
3962                         {
3963                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
3964                                 attenuation *= (1.0f / 255.0f);
3965                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
3966                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
3967                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
3968                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3969                         }
3970                         else
3971                         {
3972                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
3973                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
3974                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
3975                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
3976                         }
3977                         buffer_FragColorbgra8[x*4+0] = d[0];
3978                         buffer_FragColorbgra8[x*4+1] = d[1];
3979                         buffer_FragColorbgra8[x*4+2] = d[2];
3980                         buffer_FragColorbgra8[x*4+3] = d[3];
3981                 }
3982         }
             // variant 2: ambient + diffuse; same steps as the specular loop minus the gloss texel and eye vector
3983         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3984         {
3985                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3986                 for (x = startx;x < endx;x++)
3987                 {
3988                         z = buffer_z[x];
3989                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
3990                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
3991                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
3992                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
3993                         if (attenuation < 0.01f)
3994                                 continue;
3995                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
3996                         {
3997                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
3998                                 if (attenuation < 0.01f)
3999                                         continue;
4000                         }
4001
4002                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4003                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4004                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4005                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4006                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4007                         {
4008                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4009                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4010                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4011                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4012                         }
4013                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4014                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4015                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4016                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4017
4018                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4019                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4020                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4021                         DPSOFTRAST_Vector3Normalize(lightnormal);
4022
4023                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4024                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4025                         {
4026                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4027                                 attenuation *= (1.0f / 255.0f);
4028                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4029                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4030                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4031                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4032                         }
4033                         else
4034                         {
4035                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4036                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4037                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4038                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4039                         }
4040                         buffer_FragColorbgra8[x*4+0] = d[0];
4041                         buffer_FragColorbgra8[x*4+1] = d[1];
4042                         buffer_FragColorbgra8[x*4+2] = d[2];
4043                         buffer_FragColorbgra8[x*4+3] = d[3];
4044                 }
4045         }
             // variant 3: ambient only; the normal map is neither fetched nor read here
4046         else
4047         {
4048                 for (x = startx;x < endx;x++)
4049                 {
4050                         z = buffer_z[x];
4051                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4052                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4053                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4054                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4055                         if (attenuation < 0.01f)
4056                                 continue;
4057                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4058                         {
4059                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4060                                 if (attenuation < 0.01f)
4061                                         continue;
4062                         }
4063
4064                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4065                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4066                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4067                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4068                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4069                         {
4070                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4071                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4072                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4073                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4074                         }
4075                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4076                         {
4077                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4078                                 attenuation *= (1.0f / 255.0f);
4079                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4080                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4081                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4082                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4083                         }
4084                         else
4085                         {
4086                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4087                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4088                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4089                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4090                         }
4091                         buffer_FragColorbgra8[x*4+0] = d[0];
4092                         buffer_FragColorbgra8[x*4+1] = d[1];
4093                         buffer_FragColorbgra8[x*4+2] = d[2];
4094                         buffer_FragColorbgra8[x*4+3] = d[3];
4095                 }
4096         }
4097         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4098 #endif
4099 }
4100
4101
4102
4103 void DPSOFTRAST_VertexShader_Refraction(void)
4104 {
4105         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4106 }
4107
4108 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4109 {
4110         // TODO: IMPLEMENT
4111         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4112         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4113         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4114         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4115         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4116 }
4117
4118
4119
4120 void DPSOFTRAST_VertexShader_Water(void)
4121 {
4122         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4123 }
4124
4125
4126 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4127 {
4128         // TODO: IMPLEMENT
4129         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4130         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4131         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4132         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4133         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4134 }
4135
4136
4137
4138 void DPSOFTRAST_VertexShader_ShowDepth(void)
4139 {
4140         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4141 }
4142
4143 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4144 {
4145         // TODO: IMPLEMENT
4146         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4147         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4148         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4149         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4150         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4151 }
4152
4153
4154
4155 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4156 {
4157         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4158 }
4159
4160 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4161 {
4162         // TODO: IMPLEMENT
4163         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4164         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4165         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4166         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4167         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4168 }
4169
4170
4171
4172 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4173 {
4174         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4175 }
4176
4177 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4178 {
4179         // TODO: IMPLEMENT
4180         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4181         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4182         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4183         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4184         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4185 }
4186
4187
4188
// Element type of DPSOFTRAST_ShaderModeTable: the callbacks and input lists for one shader mode.
// The table initializes these fields positionally, so their order must not change.
4189 typedef struct DPSOFTRAST_ShaderModeInfo_s
4190 {
             // NOTE(review): meaning is not visible in this part of the file; every table row sets it to 2
4191         int lodarrayindex;
             // vertex-stage callback; the table stores the DPSOFTRAST_VertexShader_* functions here
4192         void (*Vertex)(void);
             // per-span callback; the table stores the DPSOFTRAST_PixelShader_* functions here
4193         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
             // list of DPSOFTRAST_ARRAY_* ids; the table rows end each list with ~0
4194         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
             // list of GL20TU_* ids; table rows that provide this list end it with ~0
4195         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4196 }
4197 DPSOFTRAST_ShaderModeInfo;
4198
4199 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4200 {
4201         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4202         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4203         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4204         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4205         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4206         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4207         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {~0}, {~0}},
4208         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4209         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4210         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4211         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4212         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {~0}},
4213         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4214         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4215         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4216         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}}
4217 };
4218
4219 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4220 {
4221         int i;
4222         int x;
4223         int startx;
4224         int endx;
4225 //      unsigned int c;
4226 //      unsigned int *colorpixel;
4227         unsigned int *depthpixel;
4228         float w;
4229         float wslope;
4230         int depth;
4231         int depthslope;
4232         unsigned int d;
4233         DPSOFTRAST_State_Triangle *triangle;
4234         DPSOFTRAST_State_Span *span;
4235         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4236         for (i = 0; i < thread->numspans; i++)
4237         {
4238                 span = &thread->spans[i];
4239                 triangle = &thread->triangles[span->triangle];
4240                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4241                 {
4242                         wslope = triangle->w[0];
4243                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4244                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4245                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4246                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4247                         startx = span->startx;
4248                         endx = span->endx;
4249                         switch(thread->fb_depthfunc)
4250                         {
4251                         default:
4252                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4253                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4254                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4255                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4256                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4257                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4258                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4259                         }
4260                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4261                         //for (x = startx;x < endx;x++)
4262                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4263                         // if there is no color buffer, skip pixel shader
4264                         while (startx < endx && !pixelmask[startx])
4265                                 startx++;
4266                         while (endx > startx && !pixelmask[endx-1])
4267                                 endx--;
4268                         if (startx >= endx)
4269                                 continue; // no pixels to fill
4270                         span->pixelmask = pixelmask;
4271                         span->startx = startx;
4272                         span->endx = endx;
4273                         // run pixel shader if appropriate
4274                         // do this before running depthmask code, to allow the pixelshader
4275                         // to clear pixelmask values for alpha testing
4276                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4277                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4278                         if (thread->depthmask)
4279                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4280                                         if (pixelmask[x])
4281                                                 depthpixel[x] = d;
4282                 }
4283                 else
4284                 {
4285                         // no depth testing means we're just dealing with color...
4286                         // if there is no color buffer, skip pixel shader
4287                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4288                         {
4289                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4290                                 span->pixelmask = pixelmask;
4291                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4292                         }
4293                 }
4294         }
4295         thread->numspans = 0;
4296 }
4297
// Draw command record, number 22, handled by DPSOFTRAST_Interpret_Draw.
// NOTE(review): DEFCOMMAND is defined outside this view; presumably it declares
// DPSOFTRAST_Command_Draw and DPSOFTRAST_OPCODE_Draw and prepends a common header
// (the interpreter reads command->commandsize, which is not listed here) - confirm.
// Fields as used by the interpreter:
//   starty/endy      y extent compared against each thread's y ranges
//   refcount         decremented atomically by every thread that finishes the command
//   clipped          non-zero enables the per-triangle near-plane clipping path
//   firstvertex      subtracted from element indices
//   numvertices      number of float[4] entries per vertex array inside 'arrays'
//   numtriangles     triangle count
//   arrays           vertex data: screen coordinates first, then further float[4] arrays
//   element3i/3s     optional index arrays (element3s is checked first)
4298 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4299
// Executes one queued Draw command on a rasterizer thread.
// The body is compiled only when SSE2_PRESENT is defined; otherwise the function is empty.
// For every triangle of the command it: fetches the three vertex indices, clips against
// the near plane (when command->clipped is set), culls by facing, clamps the bounds to the
// scissor rectangle and to this thread's two y ranges, fits screen-space planes for w and
// for each vertex array listed by the active shader mode, picks a mip level per texture
// unit, and cuts the triangle into spans that DPSOFTRAST_Draw_ProcessSpans processes.
// Both exits decrement command->refcount; the call that brings it to zero frees
// command->arrays when commandsize shows the command carries no inline payload.
4300 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4301 {
4302 #ifdef SSE2_PRESENT
4303         int cullface = thread->cullface;
4304         int minx, maxx, miny, maxy;
4305         int miny1, maxy1, miny2, maxy2;
4306         __m128i fbmin, fbmax;
4307         __m128 viewportcenter, viewportscale;
4308         int firstvertex = command->firstvertex;
4309         int numvertices = command->numvertices;
4310         int numtriangles = command->numtriangles;
4311         const int *element3i = command->element3i;
4312         const unsigned short *element3s = command->element3s;
4313         int clipped = command->clipped;
4314         int i;
4315         int j;
4316         int k;
4317         int y;
4318         int e[3];
4319         __m128i screeny;
4320         int starty, endy, bandy;
4321         int numpoints;
4322         int clipcase;
4323         float clipdist[4];
4324         __m128 triangleedge1, triangleedge2, trianglenormal;
4325         __m128 clipfrac[3];
4326         __m128 screen[4];
4327         DPSOFTRAST_State_Triangle *triangle;
4328         DPSOFTRAST_Texture *texture;
4329         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // clamp the thread's two y ranges (miny1..maxy1 and miny2..maxy2) to the scissor rectangle
4330         miny = thread->fb_scissor[1];
4331         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4332         miny1 = bound(miny, thread->miny1, maxy);
4333         maxy1 = bound(miny, thread->maxy1, maxy);
4334         miny2 = bound(miny, thread->miny2, maxy);
4335         maxy2 = bound(miny, thread->maxy2, maxy);
             // the command's y extent misses both ranges: only drop this thread's reference
4336         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4337         {
4338                 if (!ATOMIC_DECREMENT(command->refcount))
4339                 {
4340                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4341                                 MM_FREE(command->arrays);
4342                 }
4343                 return;
4344         }
4345         minx = thread->fb_scissor[0];
4346         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
             // clamp limits packed as 16-bit (x, y) pairs; fbmax is inclusive (limit - 1)
4347         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4348         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4349         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4350         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4351         screen[3] = _mm_setzero_ps();
4352         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4353         for (i = 0;i < numtriangles;i++)
4354         {
                     // command->arrays begins with numvertices float[4] screen coordinates, followed by
                     // numvertices float[4] unprojected positions (used below for near-plane clipping);
                     // the arrays read later follow at the same stride
4355                 const float *screencoord4f = command->arrays;
4356                 const float *arrays = screencoord4f + numvertices*4;
4357
4358                 // generate the 3 edges of this triangle
4359                 // generate spans for the triangle - switch based on left split or right split classification of triangle
                     // fetch the vertex indices relative to firstvertex; unindexed draws use consecutive vertices
4360                 if (element3s)
4361                 {
4362                         e[0] = element3s[i*3+0] - firstvertex;
4363                         e[1] = element3s[i*3+1] - firstvertex;
4364                         e[2] = element3s[i*3+2] - firstvertex;
4365                 }
4366                 else if (element3i)
4367                 {
4368                         e[0] = element3i[i*3+0] - firstvertex;
4369                         e[1] = element3i[i*3+1] - firstvertex;
4370                         e[2] = element3i[i*3+2] - firstvertex;
4371                 }
4372                 else
4373                 {
4374                         e[0] = i*3+0;
4375                         e[1] = i*3+1;
4376                         e[2] = i*3+2;
4377                 }
4378
     // SKIPBACKFACE: builds the two screen-space edges that meet at screen[1] and evaluates lane 0
     // of their cross product (edge1.x*edge2.y - edge1.y*edge2.x); depending on cullface a negative
     // (GL_BACK) or positive (GL_FRONT) value skips to the next triangle through 'continue'
4379 #define SKIPBACKFACE \
4380                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4381                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4382                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4383                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4384                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4385                 switch(cullface) \
4386                 { \
4387                 case GL_BACK: \
4388                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4389                                 continue; \
4390                         break; \
4391                 case GL_FRONT: \
4392                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4393                                 continue; \
4394                         break; \
4395                 }
4396
     // CLIPPEDVERTEXLERP: stores in clipfrac[p1] the fraction along edge p1->p2 at which clipdist
     // reaches zero, interpolates the unprojected positions with it and projects the result into screen[k]
4397 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4398                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4399                         { \
4400                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4401                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4402                         }
     // CLIPPEDVERTEXCOPY: vertex p1 is kept as is, take its stored screen coordinate
4403 #define CLIPPEDVERTEXCOPY(k,p1) \
4404                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4405
     // GENATTRIBCOPY / GENATTRIBLERP: the same copy / interpolation for the vertex array that
     // 'arrays' currently points at, reusing the clipfrac values computed during clipping
4406 #define GENATTRIBCOPY(attrib, p1) \
4407                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4408 #define GENATTRIBLERP(attrib, p1, p2) \
4409                 { \
4410                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4411                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4412                 }
     // GENATTRIBS: yields the array values at the points stored in screen[0], screen[1] and
     // screen[2] for the clipcase chosen below
4413 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4414                 switch(clipcase) \
4415                 { \
4416                 default: \
4417                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4418                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4419                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4420                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4421                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4422                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4423                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4424                 }
4425
                     // commands not flagged as clipped jump straight to the unclipped case
4426                 if (! clipped)
4427                         goto notclipped;
4428
4429                 // calculate distance from nearplane
4430                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4431                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4432                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
                     // classify by which vertices have a non-negative distance; clipcase records the
                     // combination for GENATTRIBS, numpoints is 4 when clipping leaves a quad
4433                 if (clipdist[0] >= 0.0f)
4434                 {
4435                         if (clipdist[1] >= 0.0f)
4436                         {
4437                                 if (clipdist[2] >= 0.0f)
4438                                 {
4439                                 notclipped:
4440                                         // triangle is entirely in front of nearplane
4441                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4442                                         SKIPBACKFACE;
4443                                         numpoints = 3;
4444                                         clipcase = 0;
4445                                 }
4446                                 else
4447                                 {
4448                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4449                                         SKIPBACKFACE;
4450                                         numpoints = 4;
4451                                         clipcase = 1;
4452                                 }
4453                         }
4454                         else
4455                         {
4456                                 if (clipdist[2] >= 0.0f)
4457                                 {
4458                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4459                                         SKIPBACKFACE;
4460                                         numpoints = 4;
4461                                         clipcase = 2;
4462                                 }
4463                                 else
4464                                 {
4465                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4466                                         SKIPBACKFACE;
4467                                         numpoints = 3;
4468                                         clipcase = 3;
4469                                 }
4470                         }
4471                 }
4472                 else if (clipdist[1] >= 0.0f)
4473                 {
4474                         if (clipdist[2] >= 0.0f)
4475                         {
4476                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4477                                 SKIPBACKFACE;
4478                                 numpoints = 4;
4479                                 clipcase = 4;
4480                         }
4481                         else
4482                         {
4483                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4484                                 SKIPBACKFACE;
4485                                 numpoints = 3;
4486                                 clipcase = 5;
4487                         }
4488                 }
4489                 else if (clipdist[2] >= 0.0f)
4490                 {
4491                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4492                         SKIPBACKFACE;
4493                         numpoints = 3;
4494                         clipcase = 6;
4495                 }
4496                 else continue; // triangle is entirely behind nearplane
4497
4498                 {
4499                         // calculate integer y coords for triangle points
                             // screeni packs the truncated (x, y) of up to four points into 16-bit lanes;
                             // the min/max steps reduce them to a bounding box held in lanes 0 (x) and 1 (y)
4500                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4501                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4502                                         screenmin = _mm_min_epi16(screeni, screenir),
4503                                         screenmax = _mm_max_epi16(screeni, screenir);
4504                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4505                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4506                         screenmin = _mm_max_epi16(screenmin, fbmin);
4507                         screenmax = _mm_min_epi16(screenmax, fbmax);
4508                         // skip offscreen triangles
4509                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4510                                 continue;
                             // clamped y extent, endy exclusive
4511                         starty = _mm_extract_epi16(screenmin, 1);
4512                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // the extent lies entirely in the gap between the thread's two y ranges
4513                         if (starty >= maxy1 && endy <= miny2)
4514                                 continue;
                             // keep each point's y as a sign-extended 32-bit integer for the scanline tests below
4515                         screeny = _mm_srai_epi32(screeni, 16);
4516                 }
4517
4518                 triangle = &thread->triangles[thread->numtriangles];
4519
4520                 // calculate attribute plans for triangle data...
4521                 // okay, this triangle is going to produce spans, we'd better project
4522                 // the interpolants now (this is what gives perspective texturing),
4523                 // this consists of simply multiplying all arrays by the W coord
4524                 // (which is basically 1/Z), which will be undone per-pixel
4525                 // (multiplying by Z again) to get the perspective-correct array
4526                 // values
4527                 {
4528                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4529                         __m128 mipedgescale, mipdensity;
                             // the edge components divided by the cross product give the factors that turn
                             // differences between the corners into per-pixel x and y slopes
4530                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4531                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4532                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4533                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4534                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4535                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4536                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4537                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                             // plane for w itself: triangle->w = { slope in x, slope in y, value at screen origin }
4538                         attribedge1 = _mm_sub_ss(w0, w1);
4539                         attribedge2 = _mm_sub_ss(w2, w1);
4540                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4541                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4542                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4543                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4544                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4545                         _mm_store_ss(&triangle->w[0], attribxslope);
4546                         _mm_store_ss(&triangle->w[1], attribyslope);
4547                         _mm_store_ss(&triangle->w[2], attriborigin);
4548                         mipedgescale = _mm_setzero_ps();
                             // same plane fit for every vertex array the shader mode lists; the list ends
                             // at the first entry >= DPSOFTRAST_ARRAY_TOTAL
4549                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4550                         {
4551                                 __m128 attrib0, attrib1, attrib2;
4552                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4553                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4554                                         break;
                                     // step to the data of this array
4555                                 arrays += numvertices*4;
4556                                 GENATTRIBS(attrib0, attrib1, attrib2);
                                     // corner values are multiplied by w before the plane is fitted
4557                                 attriborigin = _mm_mul_ps(attrib1, w1);
4558                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4559                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4560                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4561                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4562                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4563                                 _mm_stream_ps(triangle->attribs[k][0], attribxslope);
4564                                 _mm_stream_ps(triangle->attribs[k][1], attribyslope);
4565                                 _mm_stream_ps(triangle->attribs[k][2], attriborigin);
                                     // for the array that drives LOD selection, keep the change of its first two
                                     // components across each edge, scaled by the approximate reciprocal of the edge's screen length
4566                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4567                                 {
4568                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4569                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4570                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4571                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4572                                 }
4573                         }
4574
                             // choose a mip level for each texture unit the shader mode lists; a unit stays at
                             // level 0 unless its texture's filter is above DPSOFTRAST_TEXTURE_FILTER_LINEAR
4575                         memset(triangle->mip, 0, sizeof(triangle->mip));
4576                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4577                         {
4578                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4579                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4580                                         break;
4581                                 texture = thread->texbound[texunit];
4582                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4583                                 {
                                             // scale by the two ints at texture->mipmap[0][2..3] (presumably the base
                                             // width and height - confirm), then keep the smaller squared density of the two edges
4584                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4585                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4586                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4587                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4588                                         // this will be multiplied in the texturing routine by the texture resolution
4589                                         y = _mm_cvtss_si32(mipdensity);
4590                                         if (y > 0)
4591                                         {
                                                     // half the base-2 logarithm of y, limited to the last mip level
4592                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4593                                                 if (y > texture->mipmaps - 1)
4594                                                         y = texture->mipmaps - 1;
4595                                                 triangle->mip[texunit] = y;
4596                                         }
4597                                 }
4598                         }
4599                 }
4600         
                     // walk the scanlines range by range: first up to maxy1, then from miny2 up to maxy2
4601                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4602                 for (; y < bandy;)
4603                 {
4604                         __m128 xcoords, xslope;
                             // per point: all bits set once y is greater than that point's y (the scanline has passed it)
4605                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4606                         int yccmask = _mm_movemask_epi8(ycc);
4607                         int edge0p, edge0n, edge1p, edge1n;
4608                         int nexty;
                             // pick the two polygon edges crossing this scanline: the p index is the endpoint
                             // already passed, the n index the endpoint still ahead. All points passed ends the
                             // triangle (y = endy); no point passed yet moves on to the next scanline.
4609                         if (numpoints == 4)
4610                         {
4611                                 switch(yccmask)
4612                                 {
4613                                 default:
4614                                 case 0xFFFF: /*0000*/ y = endy; continue;
4615                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4616                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4617                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4618                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4619                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4620                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4621                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4622                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4623                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4624                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4625                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4626                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4627                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4628                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4629                                 case 0x0000: /*1111*/ y++; continue;
4630                                 }
4631                         }
4632                         else
4633                         {
4634                                 switch(yccmask)
4635                                 {
4636                                 default:
4637                                 case 0xFFFF: /*000*/ y = endy; continue;
4638                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
4639                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4640                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
4641                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
4642                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4643                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
4644                                 case 0x0000: /*111*/ y++; continue;
4645                                 }
4646                         }
                             // nexty = smallest y among the points still ahead (passed points are replaced by
                             // 0x7FFF), limited to the current range; the chosen edge pair is used up to and including it
4647                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
4648                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
4649                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
4650                         nexty = _mm_extract_epi16(ycc, 0);
4651                         if (nexty >= bandy) nexty = bandy-1;
                             // swap the pair when edge0's largest x exceeds edge1's smallest x, so that edge0 is the left edge
4652                         if (_mm_ucomigt_ss(_mm_max_ss(screen[edge0n], screen[edge0p]), _mm_min_ss(screen[edge1n], screen[edge1p])))
4653                         {
4654                                 int tmp = edge0n;
4655                                 edge0n = edge1n;
4656                                 edge1n = tmp;
4657                                 tmp = edge0p;
4658                                 edge0p = edge1p;
4659                                 edge1p = tmp;
4660                         }
                             // xslope lanes 0 and 2 hold dx/dy of edge0 and edge1; xcoords lanes 0 and 2 hold
                             // their x at scanline y, with 0.5 added before the conversion to int
4661                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
4662                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
4663                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
4664                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
4665                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
4666                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
4667                         {
4668                                 int startx, endx, offset;
4669                                 startx = _mm_cvtss_si32(xcoords);
4670                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
                                     // left of the scissor: skip whole DPSOFTRAST_DRAW_MAXSPANLENGTH chunks, the rest is
                                     // trimmed per span below (the mask assumes the span length is a power of two - confirm)
4671                                 if (startx < minx) 
4672                                 {
4673                                         if (startx < 0) startx = 0;
4674                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
4675                                 }
4676                                 if (endx > maxx) endx = maxx;
4677                                 if (startx >= endx) continue;
                                     // cut the scanline into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH pixels;
                                     // span->startx and span->endx are relative to span->x
4678                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
4679                                 {
4680                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
4681                                         span->triangle = thread->numtriangles;
4682                                         span->x = offset;
4683                                         span->y = y;
4684                                         span->startx = max(minx - offset, 0);
4685                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
4686                                         if (span->startx >= span->endx)
4687                                                 continue; 
                                             // process the queue as soon as it is full
4688                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
4689                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
4690                                 }
4691                         }
4692                 }
4693
                     // once the triangle table is full, process the queued spans and start the table over
4694                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
4695                 {
4696                         DPSOFTRAST_Draw_ProcessSpans(thread);
4697                         thread->numtriangles = 0;
4698                 }
4699         }
4700
             // drop this thread's reference to the command
4701         if (!ATOMIC_DECREMENT(command->refcount))
4702         {
4703                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4704                         MM_FREE(command->arrays);
4705         }
4706
             // process whatever is still queued and reset the triangle table
4707         if (thread->numspans > 0 || thread->numtriangles > 0)
4708         {
4709                 DPSOFTRAST_Draw_ProcessSpans(thread);
4710                 thread->numtriangles = 0;
4711         }
4712 #endif
4713 }
4714
4715 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4716 {
4717         int i;
4718         int j;
4719         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
4720         int datasize = 2*numvertices*sizeof(float[4]);
4721         DPSOFTRAST_Command_Draw *command;
4722         unsigned char *data;
4723         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4724         {
4725                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4726                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4727                         break;
4728                 datasize += numvertices*sizeof(float[4]);
4729         }
4730         if (element3s)
4731                 datasize += numtriangles*sizeof(unsigned short[3]);
4732         else if (element3i)
4733                 datasize += numtriangles*sizeof(int[3]);
4734         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
4735         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
4736         {
4737                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
4738                 data = (unsigned char *)MM_CALLOC(datasize, 1);
4739         }
4740         else
4741         {
4742                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
4743                 data = (unsigned char *)command + commandsize;
4744         }
4745         command->firstvertex = firstvertex;
4746         command->numvertices = numvertices;
4747         command->numtriangles = numtriangles;
4748         command->arrays = (float *)data;
4749         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
4750         dpsoftrast.firstvertex = firstvertex;
4751         dpsoftrast.numvertices = numvertices;
4752         dpsoftrast.screencoord4f = (float *)data;
4753         data += numvertices*sizeof(float[4]);
4754         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
4755         data += numvertices*sizeof(float[4]);
4756         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
4757         {
4758                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
4759                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
4760                         break;
4761                 dpsoftrast.post_array4f[j] = (float *)data;
4762                 data += numvertices*sizeof(float[4]);
4763         }
4764         command->element3i = NULL;
4765         command->element3s = NULL;
4766         if (element3s)
4767         {
4768                 command->element3s = (unsigned short *)data;
4769                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
4770         }
4771         else if (element3i)
4772         {
4773                 command->element3i = (int *)data;
4774                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
4775         }
4776         return command;
4777 }
4778
4779 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
4780 {
4781         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
4782         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
4783         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
4784         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
4785         if (command->starty >= command->endy)
4786         {
4787                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4788                         MM_FREE(command->arrays);
4789                 DPSOFTRAST_UndoCommand(command->commandsize);
4790                 return;
4791         }
4792         command->clipped = dpsoftrast.drawclipped;
4793         command->refcount = dpsoftrast.numthreads;
4794
4795 #ifdef USE_THREADS
4796         DPSOFTRAST_Draw_SyncCommands();
4797         {
4798                 int i;
4799                 for (i = 0; i < dpsoftrast.numthreads; i++)
4800                 {
4801                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4802                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
4803                                 SDL_CondSignal(thread->drawcond);
4804                 }
4805         }
4806 #else
4807         DPSOFTRAST_Draw_FlushThreads();
4808 #endif
4809 }
4810  
4811 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
4812 {
4813         int commandoffset = thread->commandoffset;
4814         while (commandoffset != endoffset)
4815         {
4816                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
4817                 switch (command->opcode)
4818                 {
4819 #define INTERPCOMMAND(name) \
4820                 case DPSOFTRAST_OPCODE_##name : \
4821                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
4822                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
4823                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
4824                                 commandoffset = 0; \
4825                         break;
4826                 INTERPCOMMAND(Viewport)
4827                 INTERPCOMMAND(ClearColor)
4828                 INTERPCOMMAND(ClearDepth)
4829                 INTERPCOMMAND(ColorMask)
4830                 INTERPCOMMAND(DepthTest)
4831                 INTERPCOMMAND(ScissorTest)
4832                 INTERPCOMMAND(Scissor)
4833                 INTERPCOMMAND(BlendFunc)
4834                 INTERPCOMMAND(BlendSubtract)
4835                 INTERPCOMMAND(DepthMask)
4836                 INTERPCOMMAND(DepthFunc)
4837                 INTERPCOMMAND(DepthRange)
4838                 INTERPCOMMAND(PolygonOffset)
4839                 INTERPCOMMAND(CullFace)
4840                 INTERPCOMMAND(AlphaTest)
4841                 INTERPCOMMAND(AlphaFunc)
4842                 INTERPCOMMAND(SetTexture)
4843                 INTERPCOMMAND(SetShader)
4844                 INTERPCOMMAND(Uniform4f)
4845                 INTERPCOMMAND(UniformMatrix4f)
4846                 INTERPCOMMAND(Uniform1i)
4847
4848                 case DPSOFTRAST_OPCODE_Draw:
4849                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
4850                         commandoffset += command->commandsize;
4851                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
4852                                 commandoffset = 0;
4853                         thread->commandoffset = commandoffset;
4854                         break;
4855
4856                 case DPSOFTRAST_OPCODE_Reset:
4857                         commandoffset = 0;
4858                         break;
4859                 }
4860         }
4861         thread->commandoffset = commandoffset;
4862 }
4863
#ifdef USE_THREADS
// Entry point of a worker thread; data is the thread's DPSOFTRAST_State_Thread.
// Runs until thread->index becomes negative.  While commands are pending it
// interprets them; once caught up with dpsoftrast.drawcommand it signals
// waitcond (if thread->waiting is set), marks itself starving and sleeps on
// drawcond.  Always returns 0.
static int DPSOFTRAST_Draw_Thread(void *data)
{
	DPSOFTRAST_State_Thread *self = (DPSOFTRAST_State_Thread *)data;
	for (;;)
	{
		if (!(self->index >= 0))
			break;

		if (self->commandoffset != dpsoftrast.drawcommand)
		{
			// work available: run everything queued so far
			DPSOFTRAST_Draw_InterpretCommands(self, dpsoftrast.drawcommand);
			continue;
		}

		// caught up: re-check under the mutex before going to sleep
		SDL_LockMutex(self->drawmutex);
		if (self->commandoffset == dpsoftrast.drawcommand && self->index >= 0)
		{
			if (self->waiting)
				SDL_CondSignal(self->waitcond);
			self->starving = true;
			SDL_CondWait(self->drawcond, self->drawmutex);
			self->starving = false;
		}
		SDL_UnlockMutex(self->drawmutex);
	}
	return 0;
}
#endif
4890
4891 static void DPSOFTRAST_Draw_FlushThreads(void)
4892 {
4893         DPSOFTRAST_State_Thread *thread;
4894         int i;
4895         DPSOFTRAST_Draw_SyncCommands();
4896 #ifdef USE_THREADS
4897         for (i = 0; i < dpsoftrast.numthreads; i++)
4898         {
4899                 thread = &dpsoftrast.threads[i];
4900                 if (thread->commandoffset != dpsoftrast.drawcommand)
4901                 {
4902                         SDL_LockMutex(thread->drawmutex);
4903                         if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
4904                                 SDL_CondSignal(thread->drawcond);
4905                         SDL_UnlockMutex(thread->drawmutex);
4906                 }
4907         }
4908 #endif                  
4909         for (i = 0; i < dpsoftrast.numthreads; i++)
4910         {
4911                 thread = &dpsoftrast.threads[i];
4912 #ifdef USE_THREADS
4913                 if (thread->commandoffset != dpsoftrast.drawcommand)
4914                 {
4915                         SDL_LockMutex(thread->drawmutex);
4916                         if (thread->commandoffset != dpsoftrast.drawcommand)
4917                         {
4918                                 thread->waiting = true;
4919                                 SDL_CondWait(thread->waitcond, thread->drawmutex);
4920                                 thread->waiting = false;
4921                         }
4922                         SDL_UnlockMutex(thread->drawmutex);
4923                 }
4924 #else
4925                 if (thread->commandoffset != dpsoftrast.drawcommand)
4926                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
4927 #endif
4928         }
4929         dpsoftrast.commandpool.usedcommands = 0;
4930 }
4931
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads(), which
// publishes the queued commands and returns only after every thread has
// caught up with them.
4932 void DPSOFTRAST_Flush(void)
4933 {
4934         DPSOFTRAST_Draw_FlushThreads();
4935 }
4936
// Public finish entry point: currently identical to DPSOFTRAST_Flush(), which
// it simply calls.
4937 void DPSOFTRAST_Finish(void)
4938 {
4939         DPSOFTRAST_Flush();
4940 }
4941
4942 void DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
4943 {
4944         int i;
4945         union
4946         {
4947                 int i;
4948                 unsigned char b[4];
4949         }
4950         u;
4951         u.i = 1;
4952         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
4953         dpsoftrast.bigendian = u.b[3];
4954         dpsoftrast.fb_width = width;
4955         dpsoftrast.fb_height = height;
4956         dpsoftrast.fb_depthpixels = depthpixels;
4957         dpsoftrast.fb_colorpixels[0] = colorpixels;
4958         dpsoftrast.fb_colorpixels[1] = NULL;
4959         dpsoftrast.fb_colorpixels[1] = NULL;
4960         dpsoftrast.fb_colorpixels[1] = NULL;
4961         dpsoftrast.viewport[0] = 0;
4962         dpsoftrast.viewport[1] = 0;
4963         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
4964         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
4965         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
4966         dpsoftrast.texture_firstfree = 1;
4967         dpsoftrast.texture_end = 1;
4968         dpsoftrast.texture_max = 0;
4969         dpsoftrast.color[0] = 1;
4970         dpsoftrast.color[1] = 1;
4971         dpsoftrast.color[2] = 1;
4972         dpsoftrast.color[3] = 1;
4973         dpsoftrast.interlace = bound(0, interlace, 1);
4974 #ifdef USE_THREADS
4975         dpsoftrast.numthreads = bound(1, numthreads, 64);
4976 #else
4977         dpsoftrast.numthreads = 1;
4978 #endif
4979         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
4980         for (i = 0; i < dpsoftrast.numthreads; i++)
4981         {
4982                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
4983                 thread->index = i;
4984                 thread->cullface = GL_BACK;
4985                 thread->colormask[1] = 1;
4986                 thread->colormask[2] = 1;
4987                 thread->colormask[3] = 1;
4988                 thread->blendfunc[0] = GL_ONE;
4989                 thread->blendfunc[1] = GL_ZERO;
4990                 thread->depthmask = true;
4991                 thread->depthtest = true;
4992                 thread->depthfunc = GL_LEQUAL;
4993                 thread->scissortest = false;
4994                 thread->alphatest = false;
4995                 thread->alphafunc = GL_GREATER;
4996                 thread->alphavalue = 0.5f;
4997                 thread->viewport[0] = 0;
4998                 thread->viewport[1] = 0;
4999                 thread->viewport[2] = dpsoftrast.fb_width;
5000                 thread->viewport[3] = dpsoftrast.fb_height;
5001                 thread->scissor[0] = 0;
5002                 thread->scissor[1] = 0;
5003                 thread->scissor[2] = dpsoftrast.fb_width;
5004                 thread->scissor[3] = dpsoftrast.fb_height;
5005                 thread->depthrange[0] = 0;
5006                 thread->depthrange[1] = 1;
5007                 thread->polygonoffset[0] = 0;
5008                 thread->polygonoffset[1] = 0;
5009         
5010                 if (dpsoftrast.interlace)
5011                 {
5012                         thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5013                         thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5014                         thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5015                         thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5016                 }
5017                 else
5018                 {
5019                         thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5020                         thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5021                 }
5022
5023                 thread->numspans = 0;
5024                 thread->numtriangles = 0;
5025                 thread->commandoffset = 0;
5026                 thread->waiting = false;
5027                 thread->starving = false;
5028 #ifdef USE_THREADS
5029                 thread->waitcond = SDL_CreateCond();
5030                 thread->drawcond = SDL_CreateCond();
5031                 thread->drawmutex = SDL_CreateMutex();
5032 #endif
5033
5034                 thread->validate = -1;
5035                 DPSOFTRAST_Validate(thread, -1);
5036 #ifdef USE_THREADS
5037                 thread->thread = SDL_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5038 #endif
5039         }
5040 }
5041
5042 void DPSOFTRAST_Shutdown(void)
5043 {
5044         int i;
5045 #ifdef USE_THREADS
5046         if (dpsoftrast.numthreads > 0)
5047         {
5048                 DPSOFTRAST_State_Thread *thread;
5049                 for (i = 0; i < dpsoftrast.numthreads; i++)
5050                 {
5051                         thread = &dpsoftrast.threads[i];
5052                         SDL_LockMutex(thread->drawmutex);
5053                         thread->index = -1;
5054                         SDL_CondSignal(thread->drawcond);
5055                         SDL_UnlockMutex(thread->drawmutex);
5056                         SDL_WaitThread(thread->thread, NULL);
5057                         SDL_DestroyCond(thread->waitcond);
5058                         SDL_DestroyCond(thread->drawcond);
5059                         SDL_DestroyMutex(thread->drawmutex);
5060                 }
5061         }
5062 #endif
5063         for (i = 0;i < dpsoftrast.texture_end;i++)
5064                 if (dpsoftrast.texture[i].bytes)
5065                         MM_FREE(dpsoftrast.texture[i].bytes);
5066         if (dpsoftrast.texture)
5067                 free(dpsoftrast.texture);
5068         if (dpsoftrast.threads)
5069                 MM_FREE(dpsoftrast.threads);
5070         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5071 }
5072