]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
don't crash on NULL texture upload in dpsoftrast
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
// In C builds, make "bool" an alias of qboolean (a type declared outside this
// file). C++ builds skip this because bool is already a keyword there.
9 #ifndef __cplusplus
10 typedef qboolean bool;
11 #endif
12
// Portability layer for alignment, memory barriers and atomic counters.
// ALIGN_SIZE / ATOMIC_SIZE are the byte alignments that the ALIGN() and
// ATOMIC() declaration wrappers request in the SSE2 variants below.
13 #define ALIGN_SIZE 16
14 #define ATOMIC_SIZE 32
15
// When SSE2_PRESENT is defined, pick compiler-specific definitions:
//   ALIGN(var) / ATOMIC(var)  - declare var with 16 / 32 byte alignment
//   MEMORY_BARRIER            - _mm_sfence()
//   ATOMIC_COUNTER            - the integer type used for shared counters
//   ATOMIC_INCREMENT/DECREMENT/ADD - interlocked updates of such a counter
16 #ifdef SSE2_PRESENT
17         #if defined(__APPLE__)
18                 #include <libkern/OSAtomic.h>
19                 #define ALIGN(var) var __attribute__((__aligned__(16)))
20                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
21                 #define MEMORY_BARRIER (_mm_sfence())
22                 #define ATOMIC_COUNTER volatile int32_t 
23                 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
24                 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
25                 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
26         #elif defined(__GNUC__)
27                 #define ALIGN(var) var __attribute__((__aligned__(16)))
28                 #define ATOMIC(var) var __attribute__((__aligned__(32)))
29                 #define MEMORY_BARRIER (_mm_sfence())
30                 //(__sync_synchronize())
31                 #define ATOMIC_COUNTER volatile int
32                 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
33                 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
34                 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
35         #elif defined(_MSC_VER)
36                 #define ALIGN(var) __declspec(align(16)) var
37                 #define ATOMIC(var) __declspec(align(32)) var
38                 #define MEMORY_BARRIER (_mm_sfence())
39                 //(MemoryBarrier())
40                 #define ATOMIC_COUNTER volatile LONG
41                 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
42                 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
43                 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
44         #endif
45 #endif
46
// Fallbacks for anything not defined above (no SSE2, or an unrecognized
// compiler): no alignment request, no barrier, and plain non-atomic integer
// arithmetic on the counters.
47 #ifndef ALIGN
48 #define ALIGN(var) var
49 #endif
50 #ifndef ATOMIC
51 #define ATOMIC(var) var
52 #endif
53 #ifndef MEMORY_BARRIER
54 #define MEMORY_BARRIER ((void)0)
55 #endif
56 #ifndef ATOMIC_COUNTER
57 #define ATOMIC_COUNTER int
58 #endif
59 #ifndef ATOMIC_INCREMENT
60 #define ATOMIC_INCREMENT(counter) (++(counter))
61 #endif
62 #ifndef ATOMIC_DECREMENT
63 #define ATOMIC_DECREMENT(counter) (--(counter))
64 #endif
65 #ifndef ATOMIC_ADD
66 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
67 #endif
68
// Memory helpers for pixel and state buffers.
// With SSE2 the buffers come from _mm_malloc with ATOMIC_SIZE byte alignment;
// without it the plain C allocator is used. Memory obtained through
// MM_MALLOC / MM_CALLOC must be released with MM_FREE.
#ifdef SSE2_PRESENT
#include <emmintrin.h>

#define MM_MALLOC(size) _mm_malloc(size, ATOMIC_SIZE)

// Allocates a zero-filled array of nmemb elements of size bytes each, aligned
// to ATOMIC_SIZE bytes.
// Returns NULL when the allocation fails or when nmemb*size does not fit in a
// size_t.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
	size_t total;
	void *ptr;
	// refuse requests whose byte count would wrap around, otherwise the caller
	// would be handed a buffer smaller than it asked for
	if (size != 0 && nmemb > ((size_t)-1) / size)
		return NULL;
	total = nmemb * size;
	ptr = _mm_malloc(total, ATOMIC_SIZE);
	if (ptr != NULL)
		memset(ptr, 0, total);
	return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
87
// Identifies one per-vertex attribute array (position, color, or one of eight
// texcoord sets). In this file the value is used as the first index of
// DPSOFTRAST_State_Triangle::attribs and of DPSOFTRAST_State::post_array4f;
// DPSOFTRAST_ARRAY_TOTAL is the number of arrays.
88 typedef enum DPSOFTRAST_ARRAY_e
89 {
90         DPSOFTRAST_ARRAY_POSITION,
91         DPSOFTRAST_ARRAY_COLOR,
92         DPSOFTRAST_ARRAY_TEXCOORD0,
93         DPSOFTRAST_ARRAY_TEXCOORD1,
94         DPSOFTRAST_ARRAY_TEXCOORD2,
95         DPSOFTRAST_ARRAY_TEXCOORD3,
96         DPSOFTRAST_ARRAY_TEXCOORD4,
97         DPSOFTRAST_ARRAY_TEXCOORD5,
98         DPSOFTRAST_ARRAY_TEXCOORD6,
99         DPSOFTRAST_ARRAY_TEXCOORD7,
100         DPSOFTRAST_ARRAY_TOTAL
101 }
102 DPSOFTRAST_ARRAY;
103
// One slot of the texture table (dpsoftrast.texture[]).
// A slot whose bytes pointer is NULL is treated as free by
// DPSOFTRAST_Texture_New and as nonexistent by DPSOFTRAST_Texture_GetByIndex.
104 typedef struct DPSOFTRAST_Texture_s
105 {
// flags, width, height, depth: the values passed to DPSOFTRAST_Texture_New
106         int flags;
107         int width;
108         int height;
109         int depth;
// 6 when DPSOFTRAST_TEXTURE_FLAG_CUBEMAP is set in flags, otherwise 1
110         int sides;
// changed through DPSOFTRAST_Texture_Filter
111         DPSOFTRAST_TEXTURE_FILTER filter;
// number of valid entries in mipmap[]
112         int mipmaps;
// total number of bytes allocated for all mip levels together
113         int size;
// when nonzero, the texture functions call DPSOFTRAST_Flush() before they
// modify or free the texture (presumably a count of outstanding binds - confirm)
114         ATOMIC_COUNTER binds;
// pixel storage for all mip levels, allocated with MM_CALLOC
115         unsigned char *bytes;
// per mip level: [0] byte offset into bytes, [1] size in bytes,
// [2] width, [3] height, [4] depth
116         int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
117 }
118 DPSOFTRAST_Texture;
119
// Command records: commands are declared with the same alignment as ALIGN().
120 #define COMMAND_SIZE ALIGN_SIZE
121 #define COMMAND_ALIGN(var) ALIGN(var)
122
// Common header shared by every command: an opcode plus the size of the
// command record (presumably in bytes, including this header - confirm).
123 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
124 {
125         unsigned char opcode;
126         unsigned short commandsize;
127 }
128 DPSOFTRAST_Command);
129
130 enum { DPSOFTRAST_OPCODE_Reset = 0 };
131
// DEFCOMMAND(opcodeval, name, fields) declares the enum constant
// DPSOFTRAST_OPCODE_<name> with value opcodeval and the aligned struct type
// DPSOFTRAST_Command_<name>, which starts with the same opcode/commandsize
// header as DPSOFTRAST_Command and then contains the given fields.
132 #define DEFCOMMAND(opcodeval, name, fields) \
133         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
134         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
135         { \
136                 unsigned char opcode; \
137                 unsigned short commandsize; \
138                 fields \
139         } DPSOFTRAST_Command_##name );
140
// Size in bytes of the command storage below, and (judging by the name) the
// largest size of a single command - NOTE(review): the code that enforces
// MAXCOMMANDSIZE is not in this part of the file.
141 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
142 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
143
// Storage for queued commands: a fixed byte array plus two integers that
// track its use (presumably the next free offset and the number of bytes in
// use - confirm against the code that allocates commands).
144 typedef ATOMIC(struct DPSOFTRAST_State_Command_Pool_s
145 {
146         int freecommand;
147         int usedcommands;
148         ATOMIC(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
149 }
150 DPSOFTRAST_State_Command_Pool);
151
// Per-triangle interpolation data consumed by the DPSOFTRAST_CALCATTRIB macros.
152 typedef ATOMIC(struct DPSOFTRAST_State_Triangle_s
153 {
154         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
155         float w[3];
// attribs[array] holds three float[4] rows; the CALCATTRIB macros evaluate an
// attribute as row[2] + span->x * row[0] + span->y * row[1]
156         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
157 }
158 DPSOFTRAST_State_Triangle);
159
// Evaluates one interpolated attribute of a triangle at the position of a span.
// (triangle)->attribs[arrayindex] holds three float[4] rows: row [0] is the
// change per unit of (span)->x, row [1] the change per unit of (span)->y and
// row [2] the base value. slope receives row [0]; data receives
// row[2] + x*row[0] + y*row[1].
// This variant works on __m128 values; every row must be 16-byte aligned
// because it is read with _mm_load_ps.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
	(slope) = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
	(data) = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
					_mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), (slope)), \
								_mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// Scalar variant of DPSOFTRAST_CALCATTRIB: data and slope are float[4] arrays.
// All pointer arguments are parenthesized so that expressions such as &span
// can be passed.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
	(slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
	(slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
	(slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
	(slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
	(data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
	(data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
	(data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
	(data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
176                                         
177 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
178
// One horizontal run of pixels produced for a triangle; x and y are also the
// coordinates the DPSOFTRAST_CALCATTRIB macros interpolate attributes at.
179 typedef ALIGN(struct DPSOFTRAST_State_Span_s
180 {
181         int triangle; // triangle this span was generated by
182         int x; // framebuffer x coord
183         int y; // framebuffer y coord
184         int startx; // usable range (according to pixelmask)
185         int endx; // usable range (according to pixelmask)
186         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
187 }
188 DPSOFTRAST_State_Span);
189
// Capacities of the per-thread spans[] and triangles[] buffers
// (see DPSOFTRAST_State_Thread).
190 #define DPSOFTRAST_DRAW_MAXSPANS 1024
191 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
192
// Bits of DPSOFTRAST_State_Thread::validate. A set bit means the matching
// derived values are stale; DPSOFTRAST_Validate clears the bit after calling
// DPSOFTRAST_RecalcFB, DPSOFTRAST_RecalcDepthFunc or DPSOFTRAST_RecalcBlendFunc.
193 #define DPSOFTRAST_VALIDATE_FB 1
194 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
195 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
// all three bits together
196 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
197
// Blend modes stored in DPSOFTRAST_State_Thread::fb_blendmode.
// DPSOFTRAST_RecalcBlendFunc picks one of these from the source/destination
// blend factor pair and falls back to OPAQUE for pairs it does not recognize.
198 typedef enum DPSOFTRAST_BLENDMODE_e
199 {
200         DPSOFTRAST_BLENDMODE_OPAQUE,
201         DPSOFTRAST_BLENDMODE_ALPHA,
202         DPSOFTRAST_BLENDMODE_ADDALPHA,
203         DPSOFTRAST_BLENDMODE_ADD,
204         DPSOFTRAST_BLENDMODE_INVMOD,
205         DPSOFTRAST_BLENDMODE_MUL,
206         DPSOFTRAST_BLENDMODE_MUL2,
207         DPSOFTRAST_BLENDMODE_SUBALPHA,
208         DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
209         DPSOFTRAST_BLENDMODE_INVADD,
210         DPSOFTRAST_BLENDMODE_TOTAL
211 }
212 DPSOFTRAST_BLENDMODE;
213
// State owned by one element of dpsoftrast.threads[]: the render state this
// thread draws with, values derived from it, and the thread's span/triangle
// buffers.
214 typedef ATOMIC(struct DPSOFTRAST_State_Thread_s
215 {
216         void *thread;
217         int index;
218         
// render state; the DPSOFTRAST_Recalc* functions read scissor, scissortest,
// viewport, depthtest, depthfunc, blendfunc and blendsubtract from here
219         int cullface;
220         int colormask[4];
221         int blendfunc[2];
222         int blendsubtract;
223         int depthmask;
224         int depthtest;
225         int depthfunc;
226         int scissortest;
227         int alphatest;
228         int alphafunc;
229         float alphavalue;
230         int viewport[4];
231         int scissor[4];
232         float depthrange[2];
233         float polygonoffset[2];
234
235         int shader_mode;
236         int shader_permutation;
237         int shader_exactspecularmath;
238
// pointers into dpsoftrast.texture[]; DPSOFTRAST_Texture_Grow re-points them
// when the table is reallocated
239         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
240         
241         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
242         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
243
244         // DPSOFTRAST_VALIDATE_ flags
245         int validate;
246
247         // derived values (DPSOFTRAST_VALIDATE_FB)
248         int fb_colormask;
// {x, y, width, height} of the scissor rectangle after clamping to the
// framebuffer (whole framebuffer when scissortest is off), see DPSOFTRAST_RecalcFB
249         int fb_scissor[4];
// filled in by DPSOFTRAST_RecalcViewport
250         ALIGN(float fb_viewportcenter[4]);
251         ALIGN(float fb_viewportscale[4]);
252
253         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
// depthfunc, or GL_ALWAYS while depthtest is off
254         int fb_depthfunc;
255
256         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
// one of the DPSOFTRAST_BLENDMODE values
257         int fb_blendmode;
258
259         // band boundaries
260         int miny1;
261         int maxy1;
262         int miny2;
263         int maxy2;
264
265         ATOMIC(volatile int commandoffset);
266
267         volatile bool waiting;
268         volatile bool starving;
269         void *waitcond;
270         void *drawcond;
271         void *drawmutex;
272
// span and triangle buffers with their fill counts
273         int numspans;
274         int numtriangles;
275         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
276         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
277 }
278 DPSOFTRAST_State_Thread);
279
// Global rasterizer state; the single instance is the variable dpsoftrast
// declared right after the type.
280 typedef ATOMIC(struct DPSOFTRAST_State_s
281 {
// render targets as set by DPSOFTRAST_SetRenderTargets
282         int fb_width;
283         int fb_height;
284         unsigned int *fb_depthpixels;
285         unsigned int *fb_colorpixels[4];
286
287         int viewport[4];
288         ALIGN(float fb_viewportcenter[4]);
289         ALIGN(float fb_viewportscale[4]);
290
291         float color[4];
292         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
293         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
294
295         const float *pointer_vertex3f;
296         const float *pointer_color4f;
297         const unsigned char *pointer_color4ub;
298         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
299         int stride_vertex;
300         int stride_color;
301         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
302         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
// pointers into texture[]; DPSOFTRAST_Texture_Grow re-points them when the
// table is reallocated
303         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];
304
305         int firstvertex;
306         int numvertices;
307         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
308         float *screencoord4f;
309         int drawstarty;
310         int drawendy;
311         int drawclipped;
312         
313         int shader_mode;
314         int shader_permutation;
315         int shader_exactspecularmath;
316
// texture table: texture_max is the allocated slot count (grown by
// DPSOFTRAST_Texture_Grow), texture_end is one past the highest slot in use,
// texture_firstfree is where DPSOFTRAST_Texture_New starts looking for a
// free slot
317         int texture_max;
318         int texture_end;
319         int texture_firstfree;
320         DPSOFTRAST_Texture *texture;
321
322         int bigendian;
323
324         // error reporting
// points at a string literal describing the most recent failure
325         const char *errorstring;
326
327         bool usethreads;
328         int interlace;
329         int numthreads;
330         DPSOFTRAST_State_Thread *threads;
331
332         ATOMIC(volatile int drawcommand);
333
334         DPSOFTRAST_State_Command_Pool commandpool;
335 }
336 DPSOFTRAST_State);
337
338 DPSOFTRAST_State dpsoftrast;
339
// scale used to turn a float depth into an integer depth value (2^30)
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (each scaled by 255 and rounded by adding 0.5)
// into one int laid out as a<<24 | r<<16 | g<<8 | b.
// The arguments are parenthesized so expressions can be passed, and the
// shifts are done on unsigned values because shifting the alpha byte into the
// sign bit of a signed int is undefined behavior.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) ((int)( \
	((unsigned int)(int)((r) * 255.0f + 0.5f) << 16) | \
	((unsigned int)(int)((g) * 255.0f + 0.5f) << 8) | \
	(unsigned int)(int)((b) * 255.0f + 0.5f) | \
	((unsigned int)(int)((a) * 255.0f + 0.5f) << 24)))
// Converts a float depth d into an integer depth value: DPSOFTRAST_DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
345
346 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
347 {
348         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
349         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
350         fb_viewportcenter[3] = 0.5f;
351         fb_viewportcenter[0] = 0.0f;
352         fb_viewportscale[1] = 0.5f * viewport[2];
353         fb_viewportscale[2] = -0.5f * viewport[3];
354         fb_viewportscale[3] = 0.5f;
355         fb_viewportscale[0] = 1.0f;
356 }
357
358 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
359 {
360         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
361         // and viewport projection values
362         int x1, x2;
363         int y1, y2;
364         x1 = thread->scissor[0];
365         x2 = thread->scissor[0] + thread->scissor[2];
366         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
367         y2 = dpsoftrast.fb_height - thread->scissor[1];
368         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
369         if (x1 < 0) x1 = 0;
370         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
371         if (y1 < 0) y1 = 0;
372         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
373         thread->fb_scissor[0] = x1;
374         thread->fb_scissor[1] = y1;
375         thread->fb_scissor[2] = x2 - x1;
376         thread->fb_scissor[3] = y2 - y1;
377
378         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
379 }
380
381 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
382 {
383         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
384 }
385
386 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
387 {
388         if (thread->blendsubtract)
389         {
390                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
391                 {
392                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
393                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
394                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
395                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
396                 }
397         }
398         else
399         {       
400                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
401                 {
402                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
403                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
404                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
405                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
406                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
407                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
408                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
409                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
410                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
411                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
412                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
413                 }
414         }
415 }
416
417 #define DPSOFTRAST_ValidateQuick(thread, f) ((thread->validate & (f)) ? (DPSOFTRAST_Validate(thread, f), 0) : 0)
418
419 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
420 {
421         mask &= thread->validate;
422         if (!mask)
423                 return;
424         if (mask & DPSOFTRAST_VALIDATE_FB)
425         {
426                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
427                 DPSOFTRAST_RecalcFB(thread);
428         }
429         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
430         {
431                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
432                 DPSOFTRAST_RecalcDepthFunc(thread);
433         }
434         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
435         {
436                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
437                 DPSOFTRAST_RecalcBlendFunc(thread);
438         }
439 }
440
441 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
442 {
443         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
444                 return &dpsoftrast.texture[index];
445         return NULL;
446 }
447
448 static void DPSOFTRAST_Texture_Grow(void)
449 {
450         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
451         DPSOFTRAST_State_Thread *thread;
452         int i;
453         int j;
454         DPSOFTRAST_Flush();
455         // expand texture array as needed
456         if (dpsoftrast.texture_max < 1024)
457                 dpsoftrast.texture_max = 1024;
458         else
459                 dpsoftrast.texture_max *= 2;
460         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
461         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
462                 if (dpsoftrast.texbound[i])
463                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
464         for (j = 0; j < dpsoftrast.numthreads; j++)
465         {
466                 thread = &dpsoftrast.threads[j];
467                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
468                         if (thread->texbound[i])
469                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
470         }
471 }
472
473 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
474 {
475         int w;
476         int h;
477         int d;
478         int size;
479         int s;
480         int texnum;
481         int mipmaps;
482         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
483         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
484         DPSOFTRAST_Texture *texture;
485         if (width*height*depth < 1)
486         {
487                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
488                 return 0;
489         }
490         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
491         {
492                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
493                 return 0;
494         }
495         switch(texformat)
496         {
497         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
498         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
499         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
500                 break;
501         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
502                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
503                 {
504                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
505                         return 0;
506                 }
507                 if (depth != 1)
508                 {
509                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
510                         return 0;
511                 }
512                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
513                 {
514                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
515                         return 0;
516                 }
517                 break;
518         }
519         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
520         {
521                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
522                 return 0;
523         }
524         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
525         {
526                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
527                 return 0;
528         }
529         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
530         {
531                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
532                 return 0;
533         }
534         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
535         {
536                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
537                 return 0;
538         }
539         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
540         {
541                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
542                 return 0;
543         }
544         // find first empty slot in texture array
545         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
546                 if (!dpsoftrast.texture[texnum].bytes)
547                         break;
548         dpsoftrast.texture_firstfree = texnum + 1;
549         if (dpsoftrast.texture_max <= texnum)
550                 DPSOFTRAST_Texture_Grow();
551         if (dpsoftrast.texture_end <= texnum)
552                 dpsoftrast.texture_end = texnum + 1;
553         texture = &dpsoftrast.texture[texnum];
554         memset(texture, 0, sizeof(*texture));
555         texture->flags = flags;
556         texture->width = width;
557         texture->height = height;
558         texture->depth = depth;
559         texture->sides = sides;
560         texture->binds = 0;
561         w = width;
562         h = height;
563         d = depth;
564         size = 0;
565         mipmaps = 0;
566         w = width;
567         h = height;
568         d = depth;
569         for (;;)
570         {
571                 s = w * h * d * sides * 4;
572                 texture->mipmap[mipmaps][0] = size;
573                 texture->mipmap[mipmaps][1] = s;
574                 texture->mipmap[mipmaps][2] = w;
575                 texture->mipmap[mipmaps][3] = h;
576                 texture->mipmap[mipmaps][4] = d;
577                 size += s;
578                 mipmaps++;
579                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
580                         break;
581                 if (w > 1) w >>= 1;
582                 if (h > 1) h >>= 1;
583                 if (d > 1) d >>= 1;
584         }
585         texture->mipmaps = mipmaps;
586         texture->size = size;
587
588         // allocate the pixels now
589         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
590
591         return texnum;
592 }
593 void DPSOFTRAST_Texture_Free(int index)
594 {
595         DPSOFTRAST_Texture *texture;
596         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
597         if (texture->binds)
598                 DPSOFTRAST_Flush();
599         if (texture->bytes)
600                 MM_FREE(texture->bytes);
601         texture->bytes = NULL;
602         memset(texture, 0, sizeof(*texture));
603         // adjust the free range and used range
604         if (dpsoftrast.texture_firstfree > index)
605                 dpsoftrast.texture_firstfree = index;
606         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
607                 dpsoftrast.texture_end--;
608 }
609 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
610 {
611         int i, x, y, z, w, layer0, layer1, row0, row1;
612         unsigned char *o, *i0, *i1, *i2, *i3;
613         DPSOFTRAST_Texture *texture;
614         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
615         if (texture->mipmaps <= 1)
616                 return;
617         for (i = 1;i < texture->mipmaps;i++)
618         {
619                 for (z = 0;z < texture->mipmap[i][4];z++)
620                 {
621                         layer0 = z*2;
622                         layer1 = z*2+1;
623                         if (layer1 >= texture->mipmap[i-1][4])
624                                 layer1 = texture->mipmap[i-1][4]-1;
625                         for (y = 0;y < texture->mipmap[i][3];y++)
626                         {
627                                 row0 = y*2;
628                                 row1 = y*2+1;
629                                 if (row1 >= texture->mipmap[i-1][3])
630                                         row1 = texture->mipmap[i-1][3]-1;
631                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
632                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
633                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
634                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
635                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
636                                 w = texture->mipmap[i][2];
637                                 if (layer1 > layer0)
638                                 {
639                                         if (texture->mipmap[i-1][2] > 1)
640                                         {
641                                                 // average 3D texture
642                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
643                                                 {
644                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
645                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
646                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
647                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
648                                                 }
649                                         }
650                                         else
651                                         {
652                                                 // average 3D mipmap with parent width == 1
653                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
654                                                 {
655                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
656                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
657                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
658                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
659                                                 }
660                                         }
661                                 }
662                                 else
663                                 {
664                                         if (texture->mipmap[i-1][2] > 1)
665                                         {
666                                                 // average 2D texture (common case)
667                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
668                                                 {
669                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
670                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
671                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
672                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
673                                                 }
674                                         }
675                                         else
676                                         {
677                                                 // 2D texture with parent width == 1
678                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
679                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
680                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
681                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
682                                         }
683                                 }
684                         }
685                 }
686         }
687 }
688 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
689 {
690         DPSOFTRAST_Texture *texture;
691         unsigned char *dst;
692         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
693         if (texture->binds)
694                 DPSOFTRAST_Flush();
695         if (pixels)
696         {
697                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
698                 while (blockheight > 0)
699                 {
700                         memcpy(dst, pixels, blockwidth * 4);
701                         pixels += blockwidth * 4;
702                         dst += texture->mipmap[0][2] * 4;
703                         blockheight--;
704                 }
705         }
706         DPSOFTRAST_Texture_CalculateMipmaps(index);
707 }
708 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
709 {
710         DPSOFTRAST_Texture *texture;
711         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
712         if (texture->binds)
713                 DPSOFTRAST_Flush();
714         if (pixels)
715                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
716         DPSOFTRAST_Texture_CalculateMipmaps(index);
717 }
718 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
719 {
720         DPSOFTRAST_Texture *texture;
721         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
722         return texture->mipmap[mip][2];
723 }
724 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
725 {
726         DPSOFTRAST_Texture *texture;
727         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
728         return texture->mipmap[mip][3];
729 }
730 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
731 {
732         DPSOFTRAST_Texture *texture;
733         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
734         return texture->mipmap[mip][4];
735 }
736 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
737 {
738         DPSOFTRAST_Texture *texture;
739         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
740         if (texture->binds)
741                 DPSOFTRAST_Flush();
742         return texture->bytes + texture->mipmap[mip][0];
743 }
744 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
745 {
746         DPSOFTRAST_Texture *texture;
747         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
748         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
749         {
750                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
751                 return;
752         }
753         if (texture->binds)
754                 DPSOFTRAST_Flush();
755         texture->filter = filter;
756 }
757
758 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
759 {
760         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
761                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
762                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
763                 DPSOFTRAST_Flush();
764         dpsoftrast.fb_width = width;
765         dpsoftrast.fb_height = height;
766         dpsoftrast.fb_depthpixels = depthpixels;
767         dpsoftrast.fb_colorpixels[0] = colorpixels0;
768         dpsoftrast.fb_colorpixels[1] = colorpixels1;
769         dpsoftrast.fb_colorpixels[2] = colorpixels2;
770         dpsoftrast.fb_colorpixels[3] = colorpixels3;
771 }
772
773 static void DPSOFTRAST_Draw_FlushThreads(void);
774
775 static void DPSOFTRAST_Draw_SyncCommands(void)
776 {
777         if(dpsoftrast.usethreads) MEMORY_BARRIER;
778         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
779 }
780
781 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
782 {
783         DPSOFTRAST_State_Thread *thread;
784         int i;
785         int freecommand = dpsoftrast.commandpool.freecommand;
786         int usedcommands = dpsoftrast.commandpool.usedcommands;
787         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
788                 return;
789         DPSOFTRAST_Draw_SyncCommands();
790         for(;;)
791         {
792                 int waitindex = -1;
793                 int commandoffset;
794                 usedcommands = 0;
795                 for (i = 0; i < dpsoftrast.numthreads; i++)
796                 {
797                         thread = &dpsoftrast.threads[i]; 
798                         commandoffset = freecommand - thread->commandoffset;
799                         if (commandoffset < 0)
800                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
801                         if (commandoffset > usedcommands)
802                         {
803                                 waitindex = i;
804                                 usedcommands = commandoffset;
805                         }
806                 }
807                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
808                         break;
809                 thread = &dpsoftrast.threads[waitindex];
810                 Thread_LockMutex(thread->drawmutex);
811                 if (thread->commandoffset != dpsoftrast.drawcommand)
812                 {
813                         thread->waiting = true;
814                         if (thread->starving) Thread_CondSignal(thread->drawcond);
815                         Thread_CondWait(thread->waitcond, thread->drawmutex);
816                         thread->waiting = false;
817                 }
818                 Thread_UnlockMutex(thread->drawmutex);
819         }
820         dpsoftrast.commandpool.usedcommands = usedcommands;
821 }
822
823 #define DPSOFTRAST_ALIGNCOMMAND(size) \
824         ((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
825 #define DPSOFTRAST_ALLOCATECOMMAND(name) \
826         ((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
827
828 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
829 {
830         DPSOFTRAST_Command *command;
831         int freecommand = dpsoftrast.commandpool.freecommand;
832         int usedcommands = dpsoftrast.commandpool.usedcommands;
833         int extra = sizeof(DPSOFTRAST_Command);
834         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
835                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
836         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
837         {
838                 if (dpsoftrast.usethreads)
839                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
840                 else
841                         DPSOFTRAST_Draw_FlushThreads();
842                 freecommand = dpsoftrast.commandpool.freecommand;
843                 usedcommands = dpsoftrast.commandpool.usedcommands;
844         }
845         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
846         {
847                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
848                 command->opcode = DPSOFTRAST_OPCODE_Reset;
849                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
850                 freecommand = 0;
851         }
852         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
853         command->opcode = opcode;
854         command->commandsize = size;
855         freecommand += size;
856         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
857                 freecommand = 0;
858         dpsoftrast.commandpool.freecommand = freecommand;
859         dpsoftrast.commandpool.usedcommands = usedcommands + size;
860         return command;
861 }
862
863 static void DPSOFTRAST_UndoCommand(int size)
864 {
865         int freecommand = dpsoftrast.commandpool.freecommand;
866         int usedcommands = dpsoftrast.commandpool.usedcommands;
867         freecommand -= size;
868         if (freecommand < 0)
869                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
870         usedcommands -= size;
871         dpsoftrast.commandpool.freecommand = freecommand;
872         dpsoftrast.commandpool.usedcommands = usedcommands;
873 }
874                 
875 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
876 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
877 {
878         thread->viewport[0] = command->x;
879         thread->viewport[1] = command->y;
880         thread->viewport[2] = command->width;
881         thread->viewport[3] = command->height;
882         thread->validate |= DPSOFTRAST_VALIDATE_FB;
883 }
884 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
885 {
886         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
887         command->x = x;
888         command->y = y;
889         command->width = width;
890         command->height = height;
891
892         dpsoftrast.viewport[0] = x;
893         dpsoftrast.viewport[1] = y;
894         dpsoftrast.viewport[2] = width;
895         dpsoftrast.viewport[3] = height;
896         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
897 }
898
899 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
900 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
901 {
902         int i, x1, y1, x2, y2, w, h, x, y;
903         int miny1 = thread->miny1;
904         int maxy1 = thread->maxy1;
905         int miny2 = thread->miny2;
906         int maxy2 = thread->maxy2;
907         int bandy;
908         unsigned int *p;
909         unsigned int c;
910         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
911         x1 = thread->fb_scissor[0];
912         y1 = thread->fb_scissor[1];
913         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
914         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
915         if (y1 < miny1) y1 = miny1;
916         if (y2 > maxy2) y2 = maxy2;
917         w = x2 - x1;
918         h = y2 - y1;
919         if (w < 1 || h < 1)
920                 return;
921         // FIXME: honor fb_colormask?
922         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
923         for (i = 0;i < 4;i++)
924         {
925                 if (!dpsoftrast.fb_colorpixels[i])
926                         continue;
927                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
928                 for (;y < bandy;y++)
929                 {
930                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
931                         for (x = x1;x < x2;x++)
932                                 p[x] = c;
933                 }
934         }
935 }
936 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
937 {
938         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
939         command->r = r;
940         command->g = g;
941         command->b = b;
942         command->a = a;
943 }
944
945 DEFCOMMAND(3, ClearDepth, float depth;)
946 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
947 {
948         int x1, y1, x2, y2, w, h, x, y;
949         int miny1 = thread->miny1;
950         int maxy1 = thread->maxy1;
951         int miny2 = thread->miny2;
952         int maxy2 = thread->maxy2;
953         int bandy;
954         unsigned int *p;
955         unsigned int c;
956         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
957         x1 = thread->fb_scissor[0];
958         y1 = thread->fb_scissor[1];
959         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
960         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
961         if (y1 < miny1) y1 = miny1;
962         if (y2 > maxy2) y2 = maxy2;
963         w = x2 - x1;
964         h = y2 - y1;
965         if (w < 1 || h < 1)
966                 return;
967         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
968         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
969         for (;y < bandy;y++)
970         {
971                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
972                 for (x = x1;x < x2;x++)
973                         p[x] = c;
974         }
975 }
976 void DPSOFTRAST_ClearDepth(float d)
977 {
978         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
979         command->depth = d;
980 }
981
982 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
983 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
984 {
985         thread->colormask[0] = command->r != 0;
986         thread->colormask[1] = command->g != 0;
987         thread->colormask[2] = command->b != 0;
988         thread->colormask[3] = command->a != 0;
989         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
990 }
991 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
992 {
993         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
994         command->r = r;
995         command->g = g;
996         command->b = b;
997         command->a = a;
998 }
999
1000 DEFCOMMAND(5, DepthTest, int enable;)
1001 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1002 {
1003         thread->depthtest = command->enable;
1004         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1005 }
1006 void DPSOFTRAST_DepthTest(int enable)
1007 {
1008         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1009         command->enable = enable;
1010 }
1011
1012 DEFCOMMAND(6, ScissorTest, int enable;)
1013 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1014 {
1015         thread->scissortest = command->enable;
1016         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1017 }
1018 void DPSOFTRAST_ScissorTest(int enable)
1019 {
1020         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1021         command->enable = enable;
1022 }
1023
1024 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1025 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1026 {
1027         thread->scissor[0] = command->x;
1028         thread->scissor[1] = command->y;
1029         thread->scissor[2] = command->width;
1030         thread->scissor[3] = command->height;
1031         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1032 }
1033 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1034 {
1035         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1036         command->x = x;
1037         command->y = y;
1038         command->width = width;
1039         command->height = height;
1040 }
1041
1042 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1043 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1044 {
1045         thread->blendfunc[0] = command->sfactor;
1046         thread->blendfunc[1] = command->dfactor;
1047         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1048 }
1049 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1050 {
1051         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1052         command->sfactor = sfactor;
1053         command->dfactor = dfactor;
1054 }
1055
1056 DEFCOMMAND(9, BlendSubtract, int enable;)
1057 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1058 {
1059         thread->blendsubtract = command->enable;
1060         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1061 }
1062 void DPSOFTRAST_BlendSubtract(int enable)
1063 {
1064         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1065         command->enable = enable;
1066 }
1067
1068 DEFCOMMAND(10, DepthMask, int enable;)
1069 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1070 {
1071         thread->depthmask = command->enable;
1072 }
1073 void DPSOFTRAST_DepthMask(int enable)
1074 {
1075         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1076         command->enable = enable;
1077 }
1078
1079 DEFCOMMAND(11, DepthFunc, int func;)
1080 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1081 {
1082         thread->depthfunc = command->func;
1083 }
1084 void DPSOFTRAST_DepthFunc(int func)
1085 {
1086         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1087         command->func = func;
1088 }
1089
1090 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1091 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1092 {
1093         thread->depthrange[0] = command->nearval;
1094         thread->depthrange[1] = command->farval;
1095 }
1096 void DPSOFTRAST_DepthRange(float nearval, float farval)
1097 {
1098         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1099         command->nearval = nearval;
1100         command->farval = farval;
1101 }
1102
1103 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1104 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1105 {
1106         thread->polygonoffset[0] = command->alongnormal;
1107         thread->polygonoffset[1] = command->intoview;
1108 }
1109 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1110 {
1111         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1112         command->alongnormal = alongnormal;
1113         command->intoview = intoview;
1114 }
1115
1116 DEFCOMMAND(14, CullFace, int mode;)
1117 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1118 {
1119         thread->cullface = command->mode;
1120 }
1121 void DPSOFTRAST_CullFace(int mode)
1122 {
1123         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1124         command->mode = mode;
1125 }
1126
1127 DEFCOMMAND(15, AlphaTest, int enable;)
1128 static void DPSOFTRAST_Interpret_AlphaTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaTest *command)
1129 {
1130         thread->alphatest = command->enable;
1131 }
1132 void DPSOFTRAST_AlphaTest(int enable)
1133 {
1134         DPSOFTRAST_Command_AlphaTest *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaTest);
1135         command->enable = enable;
1136 }
1137
1138 DEFCOMMAND(16, AlphaFunc, int func; float ref;)
1139 static void DPSOFTRAST_Interpret_AlphaFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_AlphaFunc *command)
1140 {
1141         thread->alphafunc = command->func;
1142         thread->alphavalue = command->ref;
1143 }
1144 void DPSOFTRAST_AlphaFunc(int func, float ref)
1145 {
1146         DPSOFTRAST_Command_AlphaFunc *command = DPSOFTRAST_ALLOCATECOMMAND(AlphaFunc);
1147         command->func = func;
1148         command->ref = ref;
1149 }
1150
1151 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1152 {
1153         dpsoftrast.color[0] = r;
1154         dpsoftrast.color[1] = g;
1155         dpsoftrast.color[2] = b;
1156         dpsoftrast.color[3] = a;
1157 }
1158
1159 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1160 {
1161         int outstride = blockwidth * 4;
1162         int instride = dpsoftrast.fb_width * 4;
1163         int bx1 = blockx;
1164         int by1 = blocky;
1165         int bx2 = blockx + blockwidth;
1166         int by2 = blocky + blockheight;
1167         int bw;
1168         int x;
1169         int y;
1170         unsigned char *inpixels;
1171         unsigned char *b;
1172         unsigned char *o;
1173         DPSOFTRAST_Flush();
1174         if (bx1 < 0) bx1 = 0;
1175         if (by1 < 0) by1 = 0;
1176         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1177         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1178         bw = bx2 - bx1;
1179         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1180         if (dpsoftrast.bigendian)
1181         {
1182                 for (y = by1;y < by2;y++)
1183                 {
1184                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1185                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1186                         for (x = bx1;x < bx2;x++)
1187                         {
1188                                 o[0] = b[3];
1189                                 o[1] = b[2];
1190                                 o[2] = b[1];
1191                                 o[3] = b[0];
1192                                 o += 4;
1193                                 b += 4;
1194                         }
1195                 }
1196         }
1197         else
1198         {
1199                 for (y = by1;y < by2;y++)
1200                 {
1201                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1202                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1203                         memcpy(o, b, bw*4);
1204                 }
1205         }
1206
1207 }
1208 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1209 {
1210         int tx1 = tx;
1211         int ty1 = ty;
1212         int tx2 = tx + width;
1213         int ty2 = ty + height;
1214         int sx1 = sx;
1215         int sy1 = sy;
1216         int sx2 = sx + width;
1217         int sy2 = sy + height;
1218         int swidth;
1219         int sheight;
1220         int twidth;
1221         int theight;
1222         int sw;
1223         int sh;
1224         int tw;
1225         int th;
1226         int y;
1227         unsigned int *spixels;
1228         unsigned int *tpixels;
1229         DPSOFTRAST_Texture *texture;
1230         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1231         if (mip < 0 || mip >= texture->mipmaps) return;
1232         DPSOFTRAST_Flush();
1233         spixels = dpsoftrast.fb_colorpixels[0];
1234         swidth = dpsoftrast.fb_width;
1235         sheight = dpsoftrast.fb_height;
1236         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1237         twidth = texture->mipmap[mip][2];
1238         theight = texture->mipmap[mip][3];
1239         if (tx1 < 0) tx1 = 0;
1240         if (ty1 < 0) ty1 = 0;
1241         if (tx2 > twidth) tx2 = twidth;
1242         if (ty2 > theight) ty2 = theight;
1243         if (sx1 < 0) sx1 = 0;
1244         if (sy1 < 0) sy1 = 0;
1245         if (sx2 > swidth) sx2 = swidth;
1246         if (sy2 > sheight) sy2 = sheight;
1247         tw = tx2 - tx1;
1248         th = ty2 - ty1;
1249         sw = sx2 - sx1;
1250         sh = sy2 - sy1;
1251         if (tw > sw) tw = sw;
1252         if (th > sh) th = sh;
1253         if (tw < 1 || th < 1)
1254                 return;
1255         sy1 = sheight - 1 - sy1;
1256         for (y = 0;y < th;y++)
1257                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1258         if (texture->mipmaps > 1)
1259                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1260 }
1261
1262 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1263 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1264 {
1265         if (thread->texbound[command->unitnum])
1266                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1267         thread->texbound[command->unitnum] = command->texture;
1268 }
1269 void DPSOFTRAST_SetTexture(int unitnum, int index)
1270 {
1271         DPSOFTRAST_Command_SetTexture *command;
1272         DPSOFTRAST_Texture *texture;
1273         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1274         {
1275                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1276                 return;
1277         }
1278         texture = DPSOFTRAST_Texture_GetByIndex(index);
1279         if (index && !texture)
1280         {
1281                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1282                 return;
1283         }
1284
1285         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1286         command->unitnum = unitnum;
1287         command->texture = texture;
1288
1289         dpsoftrast.texbound[unitnum] = texture;
1290         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1291 }
1292
1293 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1294 {
1295         dpsoftrast.pointer_vertex3f = vertex3f;
1296         dpsoftrast.stride_vertex = stride;
1297 }
1298 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1299 {
1300         dpsoftrast.pointer_color4f = color4f;
1301         dpsoftrast.pointer_color4ub = NULL;
1302         dpsoftrast.stride_color = stride;
1303 }
1304 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1305 {
1306         dpsoftrast.pointer_color4f = NULL;
1307         dpsoftrast.pointer_color4ub = color4ub;
1308         dpsoftrast.stride_color = stride;
1309 }
1310 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1311 {
1312         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1313         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1314         dpsoftrast.stride_texcoord[unitnum] = stride;
1315 }
1316
1317 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1318 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1319 {
1320         thread->shader_mode = command->mode;
1321         thread->shader_permutation = command->permutation;
1322         thread->shader_exactspecularmath = command->exactspecularmath;
1323 }
1324 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1325 {
1326         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1327         command->mode = mode;
1328         command->permutation = permutation;
1329         command->exactspecularmath = exactspecularmath;
1330
1331         dpsoftrast.shader_mode = mode;
1332         dpsoftrast.shader_permutation = permutation;
1333         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1334 }
1335
1336 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1337 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1338 {
1339         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1340 }
1341 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1342 {
1343         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1344         command->index = index;
1345         command->val[0] = v0;
1346         command->val[1] = v1;
1347         command->val[2] = v2;
1348         command->val[3] = v3;
1349
1350         dpsoftrast.uniform4f[index*4+0] = v0;
1351         dpsoftrast.uniform4f[index*4+1] = v1;
1352         dpsoftrast.uniform4f[index*4+2] = v2;
1353         dpsoftrast.uniform4f[index*4+3] = v3;
1354 }
1355 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1356 {
1357         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1358         command->index = index;
1359         memcpy(command->val, v, sizeof(command->val));
1360
1361         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1362 }
1363
1364 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1365 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1366 {
1367         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1368 }
1369 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1370 {
1371 #ifdef SSE2_PRESENT
1372         int i, index;
1373         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1374         {
1375                 __m128 m0, m1, m2, m3;
1376                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1377                 command->index = (DPSOFTRAST_UNIFORM)index;
1378                 if (((size_t)v)&(ALIGN_SIZE-1))
1379                 {
1380                         m0 = _mm_loadu_ps(v);
1381                         m1 = _mm_loadu_ps(v+4);
1382                         m2 = _mm_loadu_ps(v+8);
1383                         m3 = _mm_loadu_ps(v+12);
1384                 }
1385                 else
1386                 {
1387                         m0 = _mm_load_ps(v);
1388                         m1 = _mm_load_ps(v+4);
1389                         m2 = _mm_load_ps(v+8);
1390                         m3 = _mm_load_ps(v+12);
1391                 }
1392                 if (transpose)
1393                 {
1394                         __m128 t0, t1, t2, t3;
1395                         t0 = _mm_unpacklo_ps(m0, m1);
1396                         t1 = _mm_unpacklo_ps(m2, m3);
1397                         t2 = _mm_unpackhi_ps(m0, m1);
1398                         t3 = _mm_unpackhi_ps(m2, m3);
1399                         m0 = _mm_movelh_ps(t0, t1);
1400                         m1 = _mm_movehl_ps(t1, t0);
1401                         m2 = _mm_movelh_ps(t2, t3);
1402                         m3 = _mm_movehl_ps(t3, t2);                     
1403                 }
1404                 _mm_store_ps(command->val, m0);
1405                 _mm_store_ps(command->val+4, m1);
1406                 _mm_store_ps(command->val+8, m2);
1407                 _mm_store_ps(command->val+12, m3);
1408                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1409                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1410                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1411                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1412         }
1413 #endif
1414 }
1415
1416 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1417 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1418 {
1419         thread->uniform1i[command->index] = command->val;
1420 }
1421 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1422 {
1423         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1424         command->index = index;
1425         command->val = i0;
1426
1427         dpsoftrast.uniform1i[command->index] = i0;
1428 }
1429
1430 #ifdef SSE2_PRESENT
1431 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1432 {
1433         float *end = dst + size*4;
1434         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1435         {
1436                 while (dst < end)
1437                 {
1438                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1439                         dst += 4;
1440                         src += stride;
1441                 }
1442         }
1443         else
1444         {
1445                 while (dst < end)
1446                 {
1447                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1448                         dst += 4;
1449                         src += stride;
1450                 }
1451         }
1452 }
1453
1454 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1455 {
1456         float *end = dst + size*4;
1457         if (stride == sizeof(float[3]))
1458         {
1459                 float *end4 = dst + (size&~3)*4;        
1460                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1461                 {
1462                         while (dst < end4)
1463                         {
1464                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1465                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1466                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1467                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1468                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1469                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1470                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1471                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1472                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1473                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1474                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1475                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1476                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1477                                 dst += 16;
1478                                 src += 4*sizeof(float[3]);
1479                         }
1480                 }
1481                 else
1482                 {
1483                         while (dst < end4)
1484                         {
1485                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1486                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1487                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1488                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1489                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1490                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1491                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1492                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1493                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1494                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1495                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1496                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1497                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1498                                 dst += 16;
1499                                 src += 4*sizeof(float[3]);
1500                         }
1501                 }
1502         }
1503         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1504         {
1505                 while (dst < end)
1506                 {
1507                         __m128 v = _mm_loadu_ps((const float *)src);
1508                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1509                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1510                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1511                         _mm_store_ps(dst, v);
1512                         dst += 4;
1513                         src += stride;
1514                 }
1515         }
1516         else
1517         {
1518                 while (dst < end)
1519                 {
1520                         __m128 v = _mm_load_ps((const float *)src);
1521                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1522                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1523                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1524                         _mm_store_ps(dst, v);
1525                         dst += 4;
1526                         src += stride;
1527                 }
1528         }
1529 }
1530
1531 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1532 {
1533         float *end = dst + size*4;
1534         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1535         if (stride == sizeof(float[2]))
1536         {
1537                 float *end2 = dst + (size&~1)*4;
1538                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1539                 {
1540                         while (dst < end2)
1541                         {
1542                                 __m128 v = _mm_loadu_ps((const float *)src);
1543                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1544                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1545                                 dst += 8;
1546                                 src += 2*sizeof(float[2]);
1547                         }
1548                 }
1549                 else
1550                 {
1551                         while (dst < end2)
1552                         {
1553                                 __m128 v = _mm_load_ps((const float *)src);
1554                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1555                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1556                                 dst += 8;
1557                                 src += 2*sizeof(float[2]);
1558                         }
1559                 }
1560         }
1561         while (dst < end)
1562         {
1563                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1564                 dst += 4;
1565                 src += stride;
1566         }
1567 }
1568
1569 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1570 {
1571         float *end = dst + size*4;
1572         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1573         if (stride == sizeof(unsigned char[4]))
1574         {
1575                 float *end4 = dst + (size&~3)*4;
1576                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1577                 {
1578                         while (dst < end4)
1579                         {
1580                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1581                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1582                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1583                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1584                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1585                                 dst += 16;
1586                                 src += 4*sizeof(unsigned char[4]);
1587                         }
1588                 }
1589                 else
1590                 {
1591                         while (dst < end4)
1592                         {
1593                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1594                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1595                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1596                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1597                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1598                                 dst += 16;
1599                                 src += 4*sizeof(unsigned char[4]);
1600                         }
1601                 }
1602         }
1603         while (dst < end)
1604         {
1605                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1606                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1607                 dst += 4;
1608                 src += stride;
1609         }
1610 }
1611
// Writes `size` copies of the four floats at `src` to consecutive four-float
// slots of `dst`.
// src may be unaligned (it is read once with _mm_loadu_ps); dst is written
// with _mm_store_ps and therefore has to be 16 byte aligned.
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	float * const stop = dst + 4*size;
	float *out;
	for (out = dst; out < stop; out += 4)
		_mm_store_ps(out, value);
}
1622 #endif
1623
// Multiplies `numitems` four-float vectors from in4f by the 16 float matrix
// inmatrix16f and stores the results in out4f:
//   out = in.x*M[0..3] + in.y*M[4..7] + in.z*M[8..11] + in.w*M[12..15]
// An exact identity matrix (bitwise compare) degenerates to a plain copy,
// which is skipped when out4f and in4f are the same pointer.
// out4f is written with _mm_store_ps and therefore has to be 16 byte aligned.
// Without SSE2_PRESENT this function does nothing.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 mat0, mat1, mat2, mat3;
	float *stop;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// identity: nothing to compute
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	stop = out4f + numitems*4;
	mat0 = _mm_loadu_ps(inmatrix16f);
	mat1 = _mm_loadu_ps(inmatrix16f + 4);
	mat2 = _mm_loadu_ps(inmatrix16f + 8);
	mat3 = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1))
	{
		// input not 16 byte aligned
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			__m128 v = _mm_loadu_ps(in4f);
			__m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mat0);
			__m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), mat1);
			__m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mat2);
			__m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mat3);
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
	else
	{
		for (; out4f < stop; out4f += 4, in4f += 4)
		{
			__m128 v = _mm_load_ps(in4f);
			__m128 xterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0)), mat0);
			__m128 yterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1)), mat1);
			__m128 zterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2)), mat2);
			__m128 wterm = _mm_mul_ps(_mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3)), mat3);
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
#endif
}
1671
// Copies `numitems` four-float vectors from in4f to out4f.
// The two ranges may overlap or be the same buffer (memmove semantics);
// a non-positive numitems copies nothing.
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	if (numitems <= 0 || out4f == in4f)
		return;
	// memmove rather than memcpy: memcpy is undefined for overlapping ranges
	memmove(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
1676
1677 #ifdef SSE2_PRESENT
// Perspective-projects the __m128 vertex `in` = (x, y, z, w) and assigns the
// result to `out`:
//   out = (c[1] + s[1]*x/w, c[2] + s[2]*y/w, c[3] + s[3]*z/w, c[0] + s[0]*1/w)
// where c = viewportcenter and s = viewportscale, i.e. element 0 of both
// vectors belongs to the 1/w term and elements 1..3 to x, y, z.
// The expansion is a brace block that declares locals named p and w, so
// arguments must not be expressions that mention identifiers p or w.
1678 #define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
1679 { \
1680         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1681         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1682         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1683         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1684 }
1685
// NOTE(review): despite its name this currently expands to exactly the same
// code as DPSOFTRAST_PROJECTVERTEX (full perspective projection of `in` into
// `out`: rotate to (1, x, y, z), scale, divide by w, add the center, rotate
// back).  It is not referenced in the code visible here - confirm whether it
// is still needed.
// Like DPSOFTRAST_PROJECTVERTEX it declares locals named p and w.
1686 #define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
1687 { \
1688         __m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
1689         p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
1690         p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
1691         out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
1692 }
1693
// Transforms the __m128 vertex `in` = (x, y, z, w) by the four __m128 matrix
// vectors m0..m3 and assigns the result to `out`:
//   out = x*m0 + (y*m1 + (z*m2 + w*m3))
// The expansion is a brace block that declares a local named p, so arguments
// must not be expressions that mention an identifier p.
1694 #define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
1695 { \
1696         __m128 p = (in); \
1697         out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
1698                                                   _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
1699                                                                 _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
1700                                                                                         _mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
1701 }
1702
// Computes the range of screen rows covered by the box spanned by minpos and
// maxpos once its 8 corners are transformed by m0..m3.
// Corners with z + w >= 0 contribute their projected y/w directly; for a
// corner failing that test, each box edge leading to a passing neighbour is
// cut where z + w reaches 0 and the cut point contributes instead.
// The projected range is clamped to [-2, 2], mapped through the Y slot of
// viewportcenter/viewportscale and written to *starty and *endy.
// Returns the clip mask: bit k is set when corner k failed the z + w >= 0
// test (0xFF means every corner failed, 0 means none did).
// NOTE: the helper macros BBFRONT and BBCLIP defined in the body are not
// #undef'd, so they remain defined for the rest of the translation unit.
1703 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, __m128 minpos, __m128 maxpos, __m128 viewportcenter, __m128 viewportscale, __m128 m0, __m128 m1, __m128 m2, __m128 m3)
1704 {
1705         int clipmask = 0xFF;
1706         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
        // swap elements 0 and 1 of every matrix vector so that transformed
        // corners come out as (y, x, z, w) and the scalar (_ss) operations
        // below work on y
1707         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1708         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1709         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1710         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
        // BBFRONT(k, pos): transform corner k, remember its clip distance
        // z + w, and if that is >= 0 clear bit k of clipmask and merge y/w
        // into minproj/maxproj
1711         #define BBFRONT(k, pos) \
1712         { \
1713                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1714                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1715                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1716                 { \
1717                         __m128 proj; \
1718                         clipmask &= ~(1<<k); \
1719                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1720                         minproj = _mm_min_ss(minproj, proj); \
1721                         maxproj = _mm_max_ss(maxproj, proj); \
1722                 } \
1723         }
        // corner k takes x from maxpos when bit 0 of k is set, y when bit 1
        // is set, and z together with w when bit 2 is set (minpos otherwise)
1724         BBFRONT(0, minpos); 
1725         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1726         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1727         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1728         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1729         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1730         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1731         BBFRONT(7, maxpos);
        // BBCLIP(k): for a corner that failed the test, look at the three
        // neighbours across one axis (k^1, k^2, k^4); for each neighbour that
        // passed, interpolate along the edge to the point where the clip
        // distance is 0 and merge that point's y/w into minproj/maxproj
1732         #define BBCLIP(k) \
1733         { \
1734                 if (clipmask&(1<<k)) \
1735                 { \
1736                         if (!(clipmask&(1<<(k^1)))) \
1737                         { \
1738                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1739                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1740                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1741                                 minproj = _mm_min_ss(minproj, proj); \
1742                                 maxproj = _mm_max_ss(maxproj, proj); \
1743                         } \
1744                         if (!(clipmask&(1<<(k^2)))) \
1745                         { \
1746                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1747                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1748                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1749                                 minproj = _mm_min_ss(minproj, proj); \
1750                                 maxproj = _mm_max_ss(maxproj, proj); \
1751                         } \
1752                         if (!(clipmask&(1<<(k^4)))) \
1753                         { \
1754                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1755                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1756                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1757                                 minproj = _mm_min_ss(minproj, proj); \
1758                                 maxproj = _mm_max_ss(maxproj, proj); \
1759                         } \
1760                 } \
1761         }
1762         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
        // bring element 2 of the viewport vectors (the slot that
        // DPSOFTRAST_PROJECTVERTEX applies to y) into element 0
1763         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1764         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
        // clamp the projected range to [-2, 2] before converting to pixels
1765         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1766         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1767         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1768         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
        // the largest projected y yields starty and the smallest yields endy
        // (presumably because the viewport Y scale is negative - confirm);
        // both conversions truncate toward zero
1769         *starty = _mm_cvttss_si32(maxproj);
1770         *endy = _mm_cvttss_si32(minproj)+1;
1771         return clipmask;
1772 }
1773         
1774 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1775 {
1776         float *end = out4f + numitems*4;
1777         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1778         __m128 minpos, maxpos;
1779         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1780         {
1781                 minpos = maxpos = _mm_loadu_ps(in4f);
1782                 while (out4f < end)
1783                 {
1784                         __m128 v = _mm_loadu_ps(in4f);
1785                         minpos = _mm_min_ps(minpos, v);
1786                         maxpos = _mm_max_ps(maxpos, v);
1787                         _mm_store_ps(out4f, v);
1788                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1789                         _mm_store_ps(screen4f, v);
1790                         in4f += 4;
1791                         out4f += 4;
1792                         screen4f += 4;
1793                 }
1794         }
1795         else
1796         {
1797                 minpos = maxpos = _mm_load_ps(in4f);
1798                 while (out4f < end)
1799                 {
1800                         __m128 v = _mm_load_ps(in4f);
1801                         minpos = _mm_min_ps(minpos, v);
1802                         maxpos = _mm_max_ps(maxpos, v);
1803                         _mm_store_ps(out4f, v);
1804                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1805                         _mm_store_ps(screen4f, v);
1806                         in4f += 4;
1807                         out4f += 4;
1808                         screen4f += 4;
1809                 }
1810         }
1811         if (starty && endy) 
1812                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, 
1813                                         _mm_setr_ps(1.0f, 0.0f, 0.0f, 0.0f),
1814                                         _mm_setr_ps(0.0f, 1.0f, 0.0f, 0.0f),
1815                                         _mm_setr_ps(0.0f, 0.0f, 1.0f, 0.0f),
1816                                         _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f));
1817         return 0;
1818 }
1819
1820 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1821 {
1822         static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
1823         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1824         float *end;
1825         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1826                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1827         end = out4f + numitems*4;
1828         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1829         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1830         m0 = _mm_loadu_ps(inmatrix16f);
1831         m1 = _mm_loadu_ps(inmatrix16f + 4);
1832         m2 = _mm_loadu_ps(inmatrix16f + 8);
1833         m3 = _mm_loadu_ps(inmatrix16f + 12);
1834         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1835         {
1836                 minpos = maxpos = _mm_loadu_ps(in4f);
1837                 while (out4f < end)
1838                 {
1839                         __m128 v = _mm_loadu_ps(in4f);
1840                         minpos = _mm_min_ps(minpos, v);
1841                         maxpos = _mm_max_ps(maxpos, v);
1842                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1843                         _mm_store_ps(out4f, v);
1844                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1845                         _mm_store_ps(screen4f, v);
1846                         in4f += 4;
1847                         out4f += 4;
1848                         screen4f += 4;
1849                 }
1850         }
1851         else
1852         {
1853                 minpos = maxpos = _mm_load_ps(in4f);
1854                 while (out4f < end)
1855                 {
1856                         __m128 v = _mm_load_ps(in4f);
1857                         minpos = _mm_min_ps(minpos, v);
1858                         maxpos = _mm_max_ps(maxpos, v);
1859                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1860                         _mm_store_ps(out4f, v);
1861                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1862                         _mm_store_ps(screen4f, v);
1863                         in4f += 4;
1864                         out4f += 4;
1865                         screen4f += 4;
1866                 }
1867         }
1868         if (starty && endy) 
1869                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minpos, maxpos, viewportcenter, viewportscale, m0, m1, m2, m3); 
1870         return 0;
1871 }
1872 #endif
1873
// Expands the client vertex array selected by `inarray` into four-float
// vectors in dpsoftrast.post_array4f[outarray], covering
// dpsoftrast.numvertices vertices starting at dpsoftrast.firstvertex, and
// returns that buffer.
//  - DPSOFTRAST_ARRAY_POSITION: three floats per vertex
//  - DPSOFTRAST_ARRAY_COLOR: four floats, else four bytes, else the constant
//    dpsoftrast.color replicated
//  - anything else is treated as texcoord unit inarray-DPSOFTRAST_ARRAY_TEXCOORD0
//    with 2, 3 or 4 float components; if that unit has no pointer, or another
//    component count, the buffer is returned without being written
// Without SSE2_PRESENT nothing is loaded and NULL is returned.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *dest = dpsoftrast.post_array4f[outarray];
	const int first = dpsoftrast.firstvertex;
	const int count = dpsoftrast.numvertices;
	const unsigned char *base;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		base = (const unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
		DPSOFTRAST_Load3fTo4f(dest, base, count, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			base = (const unsigned char *)dpsoftrast.pointer_color4f + first * stride;
			DPSOFTRAST_Load4fTo4f(dest, base, count, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			base = (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride;
			DPSOFTRAST_Load4bTo4f(dest, base, count, stride);
		}
		else
			DPSOFTRAST_Fill4f(dest, dpsoftrast.color, count);
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			base = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(dest, base, count, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(dest, base, count, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(dest, base, count, stride);
				break;
			default:
				break;
			}
		}
	}
	return dest;
#else
	return NULL;
#endif
}
1932
1933 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1934 {
1935         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1936         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1937         return data;
1938 }
1939
#if 0
// (compiled out) Projects a vertex array to screen space without a matrix:
// the array is loaded when inarray >= 0 (otherwise post_array4f[outarray] is
// used as is), projected into dpsoftrast.screencoord4f, and the clip mask and
// row range are stored in dpsoftrast.drawclipped / drawstarty / drawendy.
// Returns the array, or NULL without SSE2_PRESENT.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
1952
// Transforms a vertex array in place by inmatrix16f and projects it into
// dpsoftrast.screencoord4f.  The array is loaded when inarray >= 0, otherwise
// post_array4f[outarray] is used as is.  The clip mask returned by
// DPSOFTRAST_Vertex_TransformProject is stored in dpsoftrast.drawclipped and
// the row range in dpsoftrast.drawstarty / drawendy.
// Returns the transformed array, or NULL without SSE2_PRESENT.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE2_PRESENT
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
1963
1964 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
1965 {
1966         int x;
1967         int startx = span->startx;
1968         int endx = span->endx;
1969         float wslope = triangle->w[0];
1970         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
1971         float endz = 1.0f / (w + wslope * startx);
1972         for (x = startx;x < endx;)
1973         {
1974                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
1975                 float z = endz, dz;
1976                 if (nextsub >= endx) nextsub = endsub = endx-1;
1977                 endz = 1.0f / (w + wslope * nextsub);
1978                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
1979                 for (; x <= endsub; x++, z += dz)
1980                         zf[x] = z;
1981         }
1982 }
1983
1984 void DPSOFTRAST_Draw_Span_Finish(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const float * RESTRICT in4f)
1985 {
1986         int x;
1987         int startx = span->startx;
1988         int endx = span->endx;
1989         int d[4];
1990         float a, b;
1991         unsigned char * RESTRICT pixelmask = span->pixelmask;
1992         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1993         if (!pixel)
1994                 return;
1995         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
1996         // handle alphatest now (this affects depth writes too)
1997         if (thread->alphatest)
1998                 for (x = startx;x < endx;x++)
1999                         if (in4f[x*4+3] < 0.5f)
2000                                 pixelmask[x] = false;
2001         // FIXME: this does not handle bigendian
2002         switch(thread->fb_blendmode)
2003         {
2004         case DPSOFTRAST_BLENDMODE_OPAQUE:
2005                 for (x = startx;x < endx;x++)
2006                 {
2007                         if (!pixelmask[x])
2008                                 continue;
2009                         d[0] = (int)(in4f[x*4+2]*255.0f);if (d[0] > 255) d[0] = 255;
2010                         d[1] = (int)(in4f[x*4+1]*255.0f);if (d[1] > 255) d[1] = 255;
2011                         d[2] = (int)(in4f[x*4+0]*255.0f);if (d[2] > 255) d[2] = 255;
2012                         d[3] = (int)(in4f[x*4+3]*255.0f);if (d[3] > 255) d[3] = 255;
2013                         pixel[x*4+0] = d[0];
2014                         pixel[x*4+1] = d[1];
2015                         pixel[x*4+2] = d[2];
2016                         pixel[x*4+3] = d[3];
2017                 }
2018                 break;
2019         case DPSOFTRAST_BLENDMODE_ALPHA:
2020                 for (x = startx;x < endx;x++)
2021                 {
2022                         if (!pixelmask[x])
2023                                 continue;
2024                         a = in4f[x*4+3] * 255.0f;
2025                         b = 1.0f - in4f[x*4+3];
2026                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2027                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2028                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2029                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2030                         pixel[x*4+0] = d[0];
2031                         pixel[x*4+1] = d[1];
2032                         pixel[x*4+2] = d[2];
2033                         pixel[x*4+3] = d[3];
2034                 }
2035                 break;
2036         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2037                 for (x = startx;x < endx;x++)
2038                 {
2039                         if (!pixelmask[x])
2040                                 continue;
2041                         a = in4f[x*4+3] * 255.0f;
2042                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2043                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2044                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2045                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2046                         pixel[x*4+0] = d[0];
2047                         pixel[x*4+1] = d[1];
2048                         pixel[x*4+2] = d[2];
2049                         pixel[x*4+3] = d[3];
2050                 }
2051                 break;
2052         case DPSOFTRAST_BLENDMODE_ADD:
2053                 for (x = startx;x < endx;x++)
2054                 {
2055                         if (!pixelmask[x])
2056                                 continue;
2057                         d[0] = (int)(in4f[x*4+2]*255.0f+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2058                         d[1] = (int)(in4f[x*4+1]*255.0f+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2059                         d[2] = (int)(in4f[x*4+0]*255.0f+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2060                         d[3] = (int)(in4f[x*4+3]*255.0f+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2061                         pixel[x*4+0] = d[0];
2062                         pixel[x*4+1] = d[1];
2063                         pixel[x*4+2] = d[2];
2064                         pixel[x*4+3] = d[3];
2065                 }
2066                 break;
2067         case DPSOFTRAST_BLENDMODE_INVMOD:
2068                 for (x = startx;x < endx;x++)
2069                 {
2070                         if (!pixelmask[x])
2071                                 continue;
2072                         d[0] = (int)((1.0f-in4f[x*4+2])*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2073                         d[1] = (int)((1.0f-in4f[x*4+1])*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2074                         d[2] = (int)((1.0f-in4f[x*4+0])*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2075                         d[3] = (int)((1.0f-in4f[x*4+3])*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2076                         pixel[x*4+0] = d[0];
2077                         pixel[x*4+1] = d[1];
2078                         pixel[x*4+2] = d[2];
2079                         pixel[x*4+3] = d[3];
2080                 }
2081                 break;
2082         case DPSOFTRAST_BLENDMODE_MUL:
2083                 for (x = startx;x < endx;x++)
2084                 {
2085                         if (!pixelmask[x])
2086                                 continue;
2087                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]);if (d[0] > 255) d[0] = 255;
2088                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2089                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]);if (d[2] > 255) d[2] = 255;
2090                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2091                         pixel[x*4+0] = d[0];
2092                         pixel[x*4+1] = d[1];
2093                         pixel[x*4+2] = d[2];
2094                         pixel[x*4+3] = d[3];
2095                 }
2096                 break;
2097         case DPSOFTRAST_BLENDMODE_MUL2:
2098                 for (x = startx;x < endx;x++)
2099                 {
2100                         if (!pixelmask[x])
2101                                 continue;
2102                         d[0] = (int)(in4f[x*4+2]*pixel[x*4+0]*2.0f);if (d[0] > 255) d[0] = 255;
2103                         d[1] = (int)(in4f[x*4+1]*pixel[x*4+1]*2.0f);if (d[1] > 255) d[1] = 255;
2104                         d[2] = (int)(in4f[x*4+0]*pixel[x*4+2]*2.0f);if (d[2] > 255) d[2] = 255;
2105                         d[3] = (int)(in4f[x*4+3]*pixel[x*4+3]*2.0f);if (d[3] > 255) d[3] = 255;
2106                         pixel[x*4+0] = d[0];
2107                         pixel[x*4+1] = d[1];
2108                         pixel[x*4+2] = d[2];
2109                         pixel[x*4+3] = d[3];
2110                 }
2111                 break;
2112         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2113                 for (x = startx;x < endx;x++)
2114                 {
2115                         if (!pixelmask[x])
2116                                 continue;
2117                         a = in4f[x*4+3] * -255.0f;
2118                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]);if (d[0] > 255) d[0] = 255;if (d[0] < 0) d[0] = 0;
2119                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]);if (d[1] > 255) d[1] = 255;if (d[1] < 0) d[1] = 0;
2120                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]);if (d[2] > 255) d[2] = 255;if (d[2] < 0) d[2] = 0;
2121                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]);if (d[3] > 255) d[3] = 255;if (d[3] < 0) d[3] = 0;
2122                         pixel[x*4+0] = d[0];
2123                         pixel[x*4+1] = d[1];
2124                         pixel[x*4+2] = d[2];
2125                         pixel[x*4+3] = d[3];
2126                 }
2127                 break;
2128         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2129                 for (x = startx;x < endx;x++)
2130                 {
2131                         if (!pixelmask[x])
2132                                 continue;
2133                         a = 255.0f;
2134                         b = 1.0f - in4f[x*4+3];
2135                         d[0] = (int)(in4f[x*4+2]*a+pixel[x*4+0]*b);if (d[0] > 255) d[0] = 255;
2136                         d[1] = (int)(in4f[x*4+1]*a+pixel[x*4+1]*b);if (d[1] > 255) d[1] = 255;
2137                         d[2] = (int)(in4f[x*4+0]*a+pixel[x*4+2]*b);if (d[2] > 255) d[2] = 255;
2138                         d[3] = (int)(in4f[x*4+3]*a+pixel[x*4+3]*b);if (d[3] > 255) d[3] = 255;
2139                         pixel[x*4+0] = d[0];
2140                         pixel[x*4+1] = d[1];
2141                         pixel[x*4+2] = d[2];
2142                         pixel[x*4+3] = d[3];
2143                 }
2144                 break;
2145         case DPSOFTRAST_BLENDMODE_INVADD:
2146                 for (x = startx;x < endx;x++)
2147                 {
2148                         if (!pixelmask[x])
2149                                 continue;
2150                         d[0] = (int)((255.0f-pixel[x*4+2])*in4f[x*4+0] + pixel[x*4+2]);if (d[0] > 255) d[0] = 255;
2151                         d[1] = (int)((255.0f-pixel[x*4+1])*in4f[x*4+1] + pixel[x*4+1]);if (d[1] > 255) d[1] = 255;
2152                         d[2] = (int)((255.0f-pixel[x*4+0])*in4f[x*4+2] + pixel[x*4+0]);if (d[2] > 255) d[2] = 255;
2153                         d[3] = (int)((255.0f-pixel[x*4+3])*in4f[x*4+3] + pixel[x*4+3]);if (d[3] > 255) d[3] = 255;
2154                         pixel[x*4+0] = d[0];
2155                         pixel[x*4+1] = d[1];
2156                         pixel[x*4+2] = d[2];
2157                         pixel[x*4+3] = d[3];
2158                 }
2159                 break;
2160         }
2161 }
2162
2163 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2164 {
2165 #ifdef SSE2_PRESENT
2166         int x;
2167         int startx = span->startx;
2168         int endx = span->endx;
2169         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2170         unsigned char * RESTRICT pixelmask = span->pixelmask;
2171         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2172         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2173         if (!pixel)
2174                 return;
2175         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2176         pixeli += span->y * dpsoftrast.fb_width + span->x;
2177         // handle alphatest now (this affects depth writes too)
2178         if (thread->alphatest)
2179                 for (x = startx;x < endx;x++)
2180                         if (in4ub[x*4+3] < 0.5f)
2181                                 pixelmask[x] = false;
2182         // FIXME: this does not handle bigendian
2183         switch(thread->fb_blendmode)
2184         {
2185         case DPSOFTRAST_BLENDMODE_OPAQUE:
2186                 for (x = startx;x + 4 <= endx;)
2187                 {
2188                         if (*(const unsigned int *)&pixelmask[x] == 0x01010101)
2189                         {
2190                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2191                                 x += 4;
2192                         }
2193                         else
2194                         {
2195                                 if (pixelmask[x])
2196                                         pixeli[x] = ini[x];
2197                                 x++;
2198                         }
2199                 }
2200                 for (;x < endx;x++)
2201                         if (pixelmask[x])
2202                                 pixeli[x] = ini[x];
2203                 break;
2204         case DPSOFTRAST_BLENDMODE_ALPHA:
2205         #define FINISHBLEND(blend2, blend1) \
2206                 for (x = startx;x + 1 < endx;x += 2) \
2207                 { \
2208                         __m128i src, dst; \
2209                         switch (*(const unsigned short*)&pixelmask[x]) \
2210                         { \
2211                         case 0x0101: \
2212                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2213                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2214                                 blend2; \
2215                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2216                                 continue; \
2217                         case 0x0100: \
2218                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x+1]), _mm_setzero_si128()); \
2219                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x+1]), _mm_setzero_si128()); \
2220                                 blend1; \
2221                                 pixeli[x+1] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst));  \
2222                                 continue; \
2223                         case 0x0001: \
2224                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2225                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2226                                 blend1; \
2227                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2228                                 continue; \
2229                         } \
2230                         break; \
2231                 } \
2232                 for(;x < endx; x++) \
2233                 { \
2234                         __m128i src, dst; \
2235                         if (!pixelmask[x]) \
2236                                 continue; \
2237                         src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2238                         dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2239                         blend1; \
2240                         pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2241                 }
2242
2243                 FINISHBLEND({
2244                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2245                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2246                 }, {
2247                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2248                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2249                 });
2250                 break;
2251         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2252                 FINISHBLEND({
2253                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2254                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2255                 }, {
2256                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2257                         dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2258                 });
2259                 break;
2260         case DPSOFTRAST_BLENDMODE_ADD:
2261                 FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2262                 break;
2263         case DPSOFTRAST_BLENDMODE_INVMOD:
2264                 FINISHBLEND({
2265                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2266                 }, {
2267                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2268                 });
2269                 break;
2270         case DPSOFTRAST_BLENDMODE_MUL:
2271                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2272                 break;
2273         case DPSOFTRAST_BLENDMODE_MUL2:
2274                 FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2275                 break;
2276         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2277                 FINISHBLEND({
2278                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2279                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2280                 }, {
2281                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2282                         dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2283                 });
2284                 break;
2285         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2286                 FINISHBLEND({
2287                         __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2288                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2289                 }, {
2290                         __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2291                         dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2292                 });
2293                 break;
2294         case DPSOFTRAST_BLENDMODE_INVADD:
2295                 FINISHBLEND({
2296                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2297                 }, {
2298                         dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2299                 });
2300                 break;
2301         }
2302 #endif
2303 }
2304
2305 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2306 {
2307         int x;
2308         int startx = span->startx;
2309         int endx = span->endx;
2310         int flags;
2311         float c[4];
2312         float data[4];
2313         float slope[4];
2314         float tc[2], endtc[2];
2315         float tcscale[2];
2316         unsigned int tci[2];
2317         unsigned int tci1[2];
2318         unsigned int tcimin[2];
2319         unsigned int tcimax[2];
2320         int tciwrapmask[2];
2321         int tciwidth;
2322         int filter;
2323         int mip;
2324         const unsigned char * RESTRICT pixelbase;
2325         const unsigned char * RESTRICT pixel[4];
2326         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2327         // if no texture is bound, just fill it with white
2328         if (!texture)
2329         {
2330                 for (x = startx;x < endx;x++)
2331                 {
2332                         out4f[x*4+0] = 1.0f;
2333                         out4f[x*4+1] = 1.0f;
2334                         out4f[x*4+2] = 1.0f;
2335                         out4f[x*4+3] = 1.0f;
2336                 }
2337                 return;
2338         }
2339         mip = triangle->mip[texunitindex];
2340         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2341         // if this mipmap of the texture is 1 pixel, just fill it with that color
2342         if (texture->mipmap[mip][1] == 4)
2343         {
2344                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2345                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2346                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2347                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2348                 for (x = startx;x < endx;x++)
2349                 {
2350                         out4f[x*4+0] = c[0];
2351                         out4f[x*4+1] = c[1];
2352                         out4f[x*4+2] = c[2];
2353                         out4f[x*4+3] = c[3];
2354                 }
2355                 return;
2356         }
2357         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2358         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2359         flags = texture->flags;
2360         tcscale[0] = texture->mipmap[mip][2];
2361         tcscale[1] = texture->mipmap[mip][3];
2362         tciwidth = texture->mipmap[mip][2];
2363         tcimin[0] = 0;
2364         tcimin[1] = 0;
2365         tcimax[0] = texture->mipmap[mip][2]-1;
2366         tcimax[1] = texture->mipmap[mip][3]-1;
2367         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2368         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2369         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0] - 0.5f;
2370         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1] - 0.5f;
2371         for (x = startx;x < endx;)
2372         {
2373                 unsigned int subtc[2];
2374                 unsigned int substep[2];
2375                 float subscale = 65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2376                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2377                 if (nextsub >= endx)
2378                 {
2379                         nextsub = endsub = endx-1;      
2380                         if (x < nextsub) subscale = 65536.0f / (nextsub - x);
2381                 }
2382                 tc[0] = endtc[0];
2383                 tc[1] = endtc[1];
2384                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0] - 0.5f;
2385                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1] - 0.5f;
2386                 substep[0] = (endtc[0] - tc[0]) * subscale;
2387                 substep[1] = (endtc[1] - tc[1]) * subscale;
2388                 subtc[0] = tc[0] * (1<<16);
2389                 subtc[1] = tc[1] * (1<<16);
2390                 if (filter)
2391                 {
2392                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2393                         {
2394                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2395                                 {
2396                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2397                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2398                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2399                                         tci[0] = subtc[0]>>16;
2400                                         tci[1] = subtc[1]>>16;
2401                                         tci1[0] = tci[0] + 1;
2402                                         tci1[1] = tci[1] + 1;
2403                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2404                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2405                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2406                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2407                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2408                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2409                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2410                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2411                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2412                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2413                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2414                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2415                                         out4f[x*4+0] = c[0];
2416                                         out4f[x*4+1] = c[1];
2417                                         out4f[x*4+2] = c[2];
2418                                         out4f[x*4+3] = c[3];
2419                                 }
2420                         }
2421                         else
2422                         {
2423                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2424                                 {
2425                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2426                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2427                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2428                                         tci[0] = subtc[0]>>16;
2429                                         tci[1] = subtc[1]>>16;
2430                                         tci1[0] = tci[0] + 1;
2431                                         tci1[1] = tci[1] + 1;
2432                                         tci[0] &= tciwrapmask[0];
2433                                         tci[1] &= tciwrapmask[1];
2434                                         tci1[0] &= tciwrapmask[0];
2435                                         tci1[1] &= tciwrapmask[1];
2436                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2437                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2438                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2439                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2440                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2441                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2442                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2443                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2444                                         out4f[x*4+0] = c[0];
2445                                         out4f[x*4+1] = c[1];
2446                                         out4f[x*4+2] = c[2];
2447                                         out4f[x*4+3] = c[3];
2448                                 }
2449                         }
2450                 }
2451                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2452                 {
2453                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2454                         {
2455                                 tci[0] = subtc[0]>>16;
2456                                 tci[1] = subtc[1]>>16;
2457                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2458                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2459                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2460                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2461                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2462                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2463                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2464                                 out4f[x*4+0] = c[0];
2465                                 out4f[x*4+1] = c[1];
2466                                 out4f[x*4+2] = c[2];
2467                                 out4f[x*4+3] = c[3];
2468                         }
2469                 }
2470                 else
2471                 {
2472                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2473                         {
2474                                 tci[0] = subtc[0]>>16;
2475                                 tci[1] = subtc[1]>>16;
2476                                 tci[0] &= tciwrapmask[0];
2477                                 tci[1] &= tciwrapmask[1];
2478                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2479                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2480                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2481                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2482                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2483                                 out4f[x*4+0] = c[0];
2484                                 out4f[x*4+1] = c[1];
2485                                 out4f[x*4+2] = c[2];
2486                                 out4f[x*4+3] = c[3];
2487                         }
2488                 }
2489         }
2490 }
2491
2492 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2493 {
2494 #ifdef SSE2_PRESENT
2495         int x;
2496         int startx = span->startx;
2497         int endx = span->endx;
2498         int flags;
2499         __m128 data, slope, tcscale;
2500         __m128i tcsize, tcmask, tcoffset, tcmax;
2501         __m128 tc, endtc;
2502         __m128i subtc, substep, endsubtc;
2503         int filter;
2504         int mip;
2505         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2506         const unsigned char * RESTRICT pixelbase;
2507         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2508         // if no texture is bound, just fill it with white
2509         if (!texture)
2510         {
2511                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2512                 return;
2513         }
2514         mip = triangle->mip[texunitindex];
2515         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2516         // if this mipmap of the texture is 1 pixel, just fill it with that color
2517         if (texture->mipmap[mip][1] == 4)
2518         {
2519                 unsigned int k = *((const unsigned int *)pixelbase);
2520                 for (x = startx;x < endx;x++)
2521                         outi[x] = k;
2522                 return;
2523         }
2524         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2525         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2526         flags = texture->flags;
2527         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2528         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2529         tcscale = _mm_cvtepi32_ps(tcsize);
2530         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2531         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2532         endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx])), _mm_set1_ps(0.5f));
2533         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2534         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2535         tcmax = _mm_packs_epi32(tcmask, tcmask);
2536         for (x = startx;x < endx;)
2537         {
2538                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2539                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2540                 if (nextsub >= endx)
2541                 {
2542                         nextsub = endsub = endx-1;
2543                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2544                 }       
2545                 tc = endtc;
2546                 subtc = endsubtc;
2547                 endtc = _mm_sub_ps(_mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub])), _mm_set1_ps(0.5f));
2548                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2549                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2550                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2551                 substep = _mm_slli_epi32(substep, 1);
2552                 if (filter)
2553                 {
2554                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2555                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2556                         {
2557                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2558                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2559                                 {
2560                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2561                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2562                                         tci = _mm_madd_epi16(tci, tcoffset);
2563                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2564                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2565                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2566                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2567                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2568                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2569                                         fracm = _mm_srli_epi16(subtc, 1);
2570                                         pix1 = _mm_add_epi16(pix1,
2571                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2572                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2573                                         pix3 = _mm_add_epi16(pix3,
2574                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2575                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2576                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2577                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2578                                         pix2 = _mm_add_epi16(pix2,
2579                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2580                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2581                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2582                                 }
2583                                 if (x <= endsub)
2584                                 {
2585                                         const unsigned char * RESTRICT ptr1;
2586                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2587                                         tci = _mm_madd_epi16(tci, tcoffset);
2588                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2589                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2590                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2591                                         fracm = _mm_srli_epi16(subtc, 1);
2592                                         pix1 = _mm_add_epi16(pix1,
2593                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2594                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2595                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2596                                         pix1 = _mm_add_epi16(pix1,
2597                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2598                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2599                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2600                                         x++;
2601                                 }
2602                         }
2603                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2604                         {
2605                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2606                                 {
2607                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2608                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2609                                         tci = _mm_madd_epi16(tci, tcoffset);
2610                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2611                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2612                                                                                         _mm_setzero_si128());
2613                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2614                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2615                                                                                         _mm_setzero_si128());
2616                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2617                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2618                                         tci = _mm_madd_epi16(tci, tcoffset);
2619                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2620                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2621                                                                                         _mm_setzero_si128());
2622                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2623                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2624                                                                                         _mm_setzero_si128());
2625                                         fracm = _mm_srli_epi16(subtc, 1);
2626                                         pix1 = _mm_add_epi16(pix1,
2627                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2628                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2629                                         pix3 = _mm_add_epi16(pix3,
2630                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2631                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2632                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2633                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2634                                         pix2 = _mm_add_epi16(pix2,
2635                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2636                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2637                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2638                                 }
2639                                 if (x <= endsub)
2640                                 {
2641                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2642                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2643                                         tci = _mm_madd_epi16(tci, tcoffset);
2644                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2645                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2646                                                                                         _mm_setzero_si128());
2647                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2648                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2649                                                                                         _mm_setzero_si128());
2650                                         fracm = _mm_srli_epi16(subtc, 1);
2651                                         pix1 = _mm_add_epi16(pix1,
2652                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2653                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2654                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2655                                         pix1 = _mm_add_epi16(pix1,
2656                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2657                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2658                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2659                                         x++;
2660                                 }
2661                         }
2662                         else
2663                         {
2664                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2665                                 {
2666                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2667                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2668                                         tci = _mm_madd_epi16(tci, tcoffset);
2669                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2670                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2671                                                                                         _mm_setzero_si128());
2672                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2673                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2674                                                                                         _mm_setzero_si128());
2675                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2676                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2677                                         tci = _mm_madd_epi16(tci, tcoffset);
2678                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2679                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2680                                                                                         _mm_setzero_si128());
2681                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2682                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2683                                                                                         _mm_setzero_si128());
2684                                         fracm = _mm_srli_epi16(subtc, 1);
2685                                         pix1 = _mm_add_epi16(pix1,
2686                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2687                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2688                                         pix3 = _mm_add_epi16(pix3,
2689                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2690                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2691                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2692                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2693                                         pix2 = _mm_add_epi16(pix2,
2694                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2695                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2696                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2697                                 }
2698                                 if (x <= endsub)
2699                                 {
2700                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2701                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2702                                         tci = _mm_madd_epi16(tci, tcoffset);
2703                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2704                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2705                                                                                         _mm_setzero_si128());
2706                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2707                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2708                                                                                         _mm_setzero_si128());
2709                                         fracm = _mm_srli_epi16(subtc, 1);
2710                                         pix1 = _mm_add_epi16(pix1,
2711                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2712                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2713                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2714                                         pix1 = _mm_add_epi16(pix1,
2715                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2716                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2717                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2718                                         x++;
2719                                 }
2720                         }
2721                 }
2722                 else
2723                 {
2724                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2725                         {
2726                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2727                                 {
2728                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2729                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2730                                         tci = _mm_madd_epi16(tci, tcoffset);
2731                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2732                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2733                                 }
2734                                 if (x <= endsub)
2735                                 {
2736                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2737                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2738                                         tci = _mm_madd_epi16(tci, tcoffset);
2739                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2740                                         x++;
2741                                 }
2742                         }
2743                         else
2744                         {
2745                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2746                                 {
2747                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2748                                         tci = _mm_and_si128(tci, tcmax); 
2749                                         tci = _mm_madd_epi16(tci, tcoffset);
2750                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2751                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2752                                 }
2753                                 if (x <= endsub)
2754                                 {
2755                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2756                                         tci = _mm_and_si128(tci, tcmax); 
2757                                         tci = _mm_madd_epi16(tci, tcoffset);
2758                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2759                                         x++;
2760                                 }
2761                         }
2762                 }
2763         }
2764 #endif
2765 }
2766
2767 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2768 {
2769         // TODO: IMPLEMENT
2770         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2771 }
2772
// Shadowmap lookup stub: the input vector is ignored and the constant 1.0f
// is returned for every call.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2778
2779 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2780 {
2781         int x;
2782         int startx = span->startx;
2783         int endx = span->endx;
2784         float c[4];
2785         float data[4];
2786         float slope[4];
2787         float z;
2788         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2789         for (x = startx;x < endx;x++)
2790         {
2791                 z = zf[x];
2792                 c[0] = (data[0] + slope[0]*x) * z;
2793                 c[1] = (data[1] + slope[1]*x) * z;
2794                 c[2] = (data[2] + slope[2]*x) * z;
2795                 c[3] = (data[3] + slope[3]*x) * z;
2796                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2797                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2798                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2799                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2800         }
2801 }
2802
2803 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2804 {
2805         int x;
2806         int startx = span->startx;
2807         int endx = span->endx;
2808         float c[4];
2809         float data[4];
2810         float slope[4];
2811         float z;
2812         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2813         for (x = startx;x < endx;x++)
2814         {
2815                 z = zf[x];
2816                 c[0] = (data[0] + slope[0]*x) * z;
2817                 c[1] = (data[1] + slope[1]*x) * z;
2818                 c[2] = (data[2] + slope[2]*x) * z;
2819                 c[3] = (data[3] + slope[3]*x) * z;
2820                 out4f[x*4+0] = c[0];
2821                 out4f[x*4+1] = c[1];
2822                 out4f[x*4+2] = c[2];
2823                 out4f[x*4+3] = c[3];
2824         }
2825 }
2826
2827 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2828 {
2829         int x, startx = span->startx, endx = span->endx;
2830         float c[4], localcolor[4];
2831         localcolor[0] = subcolor[0];
2832         localcolor[1] = subcolor[1];
2833         localcolor[2] = subcolor[2];
2834         localcolor[3] = subcolor[3];
2835         for (x = startx;x < endx;x++)
2836         {
2837                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2838                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2839                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2840                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2841                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2842                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2843                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2844                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2845         }
2846 }
2847
2848 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2849 {
2850         int x, startx = span->startx, endx = span->endx;
2851         for (x = startx;x < endx;x++)
2852         {
2853                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2854                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2855                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2856                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2857         }
2858 }
2859
2860 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2861 {
2862         int x, startx = span->startx, endx = span->endx;
2863         for (x = startx;x < endx;x++)
2864         {
2865                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2866                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2867                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2868                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2869         }
2870 }
2871
2872 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2873 {
2874         int x, startx = span->startx, endx = span->endx;
2875         float a, b;
2876         for (x = startx;x < endx;x++)
2877         {
2878                 a = 1.0f - inb4f[x*4+3];
2879                 b = inb4f[x*4+3];
2880                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2881                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2882                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2883                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2884         }
2885 }
2886
2887 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2888 {
2889         int x, startx = span->startx, endx = span->endx;
2890         float localcolor[4], ilerp, lerp;
2891         localcolor[0] = color[0];
2892         localcolor[1] = color[1];
2893         localcolor[2] = color[2];
2894         localcolor[3] = color[3];
2895         ilerp = 1.0f - localcolor[3];
2896         lerp = localcolor[3];
2897         for (x = startx;x < endx;x++)
2898         {
2899                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2900                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2901                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2902                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2903         }
2904 }
2905
2906
2907
2908 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2909 {
2910 #ifdef SSE2_PRESENT
2911         int x;
2912         int startx = span->startx;
2913         int endx = span->endx;
2914         __m128 data, slope;
2915         __m128 mod, endmod;
2916         __m128i submod, substep, endsubmod;
2917         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2918         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2919         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2920         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2921         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2922         for (x = startx; x < endx;)
2923         {
2924                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2925                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2926                 if (nextsub >= endx)
2927                 {
2928                         nextsub = endsub = endx-1;
2929                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2930                 }
2931                 mod = endmod;
2932                 submod = endsubmod;
2933                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2934                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2935                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2936                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2937                 substep = _mm_packs_epi32(substep, substep);
2938                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2939                 {
2940                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2941                         pix = _mm_mulhi_epu16(pix, submod);
2942                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2943                 }
2944                 if (x <= endsub)
2945                 {
2946                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
2947                         pix = _mm_mulhi_epu16(pix, submod);
2948                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2949                         x++;
2950                 }
2951         }
2952 #endif
2953 }
2954
2955 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
2956 {
2957 #ifdef SSE2_PRESENT
2958         int x;
2959         int startx = span->startx;
2960         int endx = span->endx;
2961         __m128 data, slope;
2962         __m128 mod, endmod;
2963         __m128i submod, substep, endsubmod;
2964         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2965         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2966         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2967         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2968         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2969         for (x = startx; x < endx;)
2970         {
2971                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2972                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2973                 if (nextsub >= endx)
2974                 {
2975                         nextsub = endsub = endx-1;
2976                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
2977                 }
2978                 mod = endmod;
2979                 submod = endsubmod;
2980                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2981                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2982                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
2983                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2984                 substep = _mm_packs_epi32(substep, substep);
2985                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2986                 {
2987                         __m128i pix = _mm_srai_epi16(submod, 4);
2988                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
2989                 }
2990                 if (x <= endsub)
2991                 {
2992                         __m128i pix = _mm_srai_epi16(submod, 4);
2993                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
2994                         x++;
2995                 }
2996         }
2997 #endif
2998 }
2999
3000 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3001 {
3002 #ifdef SSE2_PRESENT
3003         int x, startx = span->startx, endx = span->endx;
3004         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3005         localcolor = _mm_packs_epi32(localcolor, localcolor);
3006         for (x = startx;x+2 <= endx;x+=2)
3007         {
3008                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3009                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3010                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3011                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3012         }
3013         if (x < endx)
3014         {
3015                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3016                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3017                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3018                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3019         }
3020 #endif
3021 }
3022
3023 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3024 {
3025 #ifdef SSE2_PRESENT
3026         int x, startx = span->startx, endx = span->endx;
3027         for (x = startx;x+2 <= endx;x+=2)
3028         {
3029                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3030                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3031                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3032                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3033         }
3034         if (x < endx)
3035         {
3036                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3037                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3038                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3039                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3040         }
3041 #endif
3042 }
3043
3044 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3045 {
3046 #ifdef SSE2_PRESENT
3047         int x, startx = span->startx, endx = span->endx;
3048         for (x = startx;x+2 <= endx;x+=2)
3049         {
3050                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3051                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3052                 pix1 = _mm_add_epi16(pix1, pix2);
3053                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3054         }
3055         if (x < endx)
3056         {
3057                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3058                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3059                 pix1 = _mm_add_epi16(pix1, pix2);
3060                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3061         }
3062 #endif
3063 }
3064
3065 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3066 {
3067 #ifdef SSE2_PRESENT
3068         int x, startx = span->startx, endx = span->endx;
3069         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3070         tint = _mm_packs_epi32(tint, tint);
3071         for (x = startx;x+2 <= endx;x+=2)
3072         {
3073                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3074                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3075                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3076                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3077         }
3078         if (x < endx)
3079         {
3080                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3081                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3082                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3083                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3084         }
3085 #endif
3086 }
3087
3088 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3089 {
3090 #ifdef SSE2_PRESENT
3091         int x, startx = span->startx, endx = span->endx;
3092         for (x = startx;x+2 <= endx;x+=2)
3093         {
3094                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3095                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3096                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3097                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3098                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3099         }
3100         if (x < endx)
3101         {
3102                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3103                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3104                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3105                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3106                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3107         }
3108 #endif
3109 }
3110
3111 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3112 {
3113 #ifdef SSE2_PRESENT
3114         int x, startx = span->startx, endx = span->endx;
3115         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3116         localcolor = _mm_packs_epi32(localcolor, localcolor);
3117         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3118         for (x = startx;x+2 <= endx;x+=2)
3119         {
3120                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3121                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3122                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3123         }
3124         if (x < endx)
3125         {
3126                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3127                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3128                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3129         }
3130 #endif
3131 }
3132
3133
3134
3135 void DPSOFTRAST_VertexShader_Generic(void)
3136 {
3137         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3138         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3139         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3140         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3141                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3142 }
3143
3144 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3145 {
3146         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3147         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3148         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3149         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3150         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3151         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3152         {
3153                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3154                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3155                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3156                 {
3157                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3158                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3159                         {
3160                                 // multiply
3161                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3162                         }
3163                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3164                         {
3165                                 // add
3166                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3167                         }
3168                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3169                         {
3170                                 // alphablend
3171                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3172                         }
3173                 }
3174         }
3175         else
3176                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3177         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3178 }
3179
3180
3181
3182 void DPSOFTRAST_VertexShader_PostProcess(void)
3183 {
3184         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3185         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3186         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3187 }
3188
3189 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3190 {
3191         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3192         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3193         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3194         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3195         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3196         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3197         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3198         {
3199                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3200                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3201         }
3202         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3203         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3204         {
3205                 // TODO: implement saturation
3206         }
3207         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3208         {
3209                 // TODO: implement gammaramps
3210         }
3211         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3212 }
3213
3214
3215
3216 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3217 {
3218         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3219 }
3220
3221 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3222 {
3223         // this is never called (because colormask is off when this shader is used)
3224         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3225         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3226         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3227         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3228         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3229 }
3230
3231
3232
3233 void DPSOFTRAST_VertexShader_FlatColor(void)
3234 {
3235         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3236         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3237 }
3238
3239 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3240 {
3241 #ifdef SSE2_PRESENT
3242         unsigned char * RESTRICT pixelmask = span->pixelmask;
3243         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3244         int x, startx = span->startx, endx = span->endx;
3245         __m128i Color_Ambientm;
3246         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3247         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3248         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3249         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3250         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3251         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3252                 pixel = buffer_FragColorbgra8;
3253         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3254         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3255         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3256         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3257         for (x = startx;x < endx;x++)
3258         {
3259                 __m128i color, pix;
3260                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3261                 {
3262                         __m128i pix2;
3263                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3264                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3265                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3266                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3267                         x += 3;
3268                         continue;
3269                 }
3270                 if (!pixelmask[x])
3271                         continue;
3272                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3273                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3274                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3275         }
3276         if (pixel == buffer_FragColorbgra8)
3277                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3278 #endif
3279 }
3280
3281
3282
3283 void DPSOFTRAST_VertexShader_VertexColor(void)
3284 {
3285         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3286         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3287         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3288 }
3289
3290 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3291 {
3292 #ifdef SSE2_PRESENT
3293         unsigned char * RESTRICT pixelmask = span->pixelmask;
3294         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3295         int x, startx = span->startx, endx = span->endx;
3296         __m128i Color_Ambientm, Color_Diffusem;
3297         __m128 data, slope;
3298         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3299         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3300         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3301         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3302         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3303         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3304         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3305                 pixel = buffer_FragColorbgra8;
3306         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3307         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3308         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3309         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3310         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3311         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3312         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3313         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3314         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3315         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3316         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3317         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3318         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3319         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3320         {
3321                 __m128i color, mod, pix;
3322                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3323                 {
3324                         __m128i pix2, mod2;
3325                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3326                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3327                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3328                         data = _mm_add_ps(data, slope);
3329                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3330                         data = _mm_add_ps(data, slope);
3331                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3332                         data = _mm_add_ps(data, slope);
3333                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3334                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3335                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3336                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3337                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3338                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3339                         x += 3;
3340                         continue;
3341                 }
3342                 if (!pixelmask[x])
3343                         continue;
3344                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3345                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3346                 mod = _mm_packs_epi32(mod, mod);
3347                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3348                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3349         }
3350         if (pixel == buffer_FragColorbgra8)
3351                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3352 #endif
3353 }
3354
3355
3356
3357 void DPSOFTRAST_VertexShader_Lightmap(void)
3358 {
3359         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3360         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3361         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3362 }
3363
3364 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3365 {
3366 #ifdef SSE2_PRESENT
3367         unsigned char * RESTRICT pixelmask = span->pixelmask;
3368         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3369         int x, startx = span->startx, endx = span->endx;
3370         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3371         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3372         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3373         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3374         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3375         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3376         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3377         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3378         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3379         if (thread->alphatest || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3380                 pixel = buffer_FragColorbgra8;
3381         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3382         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3383         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3384         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3385         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3386         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3387         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3388         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3389         {
3390                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3391                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3392                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3393                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3394                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3395                 for (x = startx;x < endx;x++)
3396                 {
3397                         __m128i color, lightmap, glow, pix;
3398                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3399                         {
3400                                 __m128i pix2;
3401                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3402                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3403                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3404                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3405                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3406                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3407                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3408                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3409                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3410                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3411                                 x += 3;
3412                                 continue;
3413                         }
3414                         if (!pixelmask[x])
3415                                 continue;
3416                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3417                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3418                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3419                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3420                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3421                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3422                 }
3423         }
3424         else
3425         {
3426                 for (x = startx;x < endx;x++)
3427                 {
3428                         __m128i color, lightmap, pix;
3429                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3430                         {
3431                                 __m128i pix2;
3432                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3433                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3434                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3435                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3436                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3437                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3438                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3439                                 x += 3;
3440                                 continue;
3441                         }
3442                         if (!pixelmask[x]) 
3443                                 continue;
3444                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3445                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3446                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3447                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3448                 }
3449         }
3450         if (pixel == buffer_FragColorbgra8)
3451                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3452 #endif
3453 }
3454
3455
3456 void DPSOFTRAST_VertexShader_LightDirection(void);
3457 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3458
// Vertex shader entry point for the FakeLight mode.
// Does no work of its own: it simply runs the LightDirection vertex shader.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3463
3464 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3465 {
3466         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3467 }
3468
3469
3470
3471 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3472 {
3473         DPSOFTRAST_VertexShader_LightDirection();
3474         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3475 }
3476
3477 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3478 {
3479         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3480 }
3481
3482
3483
3484 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3485 {
3486         DPSOFTRAST_VertexShader_LightDirection();
3487         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3488 }
3489
3490 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3491 {
3492         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3493 }
3494
3495
3496
3497 void DPSOFTRAST_VertexShader_LightDirection(void)
3498 {
3499         int i;
3500         int numvertices = dpsoftrast.numvertices;
3501         float LightDir[4];
3502         float LightVector[4];
3503         float EyePosition[4];
3504         float EyeVectorModelSpace[4];
3505         float EyeVector[4];
3506         float position[4];
3507         float svector[4];
3508         float tvector[4];
3509         float normal[4];
3510         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3511         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3512         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3513         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3514         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3515         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3516         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3517         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3518         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3519         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3520         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3521         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3522         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3523         for (i = 0;i < numvertices;i++)
3524         {
3525                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3526                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3527                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3528                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3529                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3530                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3531                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3532                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3533                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3534                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3535                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3536                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3537                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3538                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3539                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3540                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3541                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3542                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3543                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3544                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3545                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3546                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3547                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3548                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3549                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3550                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3551                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3552                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3553                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3554         }
3555         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3556 }
3557
3558 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3559 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3560 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3561 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3562 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3563 #define DPSOFTRAST_Vector3Normalize(v)\
3564 do\
3565 {\
3566         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3567         if (len)\
3568         {\
3569                 len = 1.0f / len;\
3570                 v[0] *= len;\
3571                 v[1] *= len;\
3572                 v[2] *= len;\
3573         }\
3574 }\
3575 while(0)
3576
3577 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3578 {
3579         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3580         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3581         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3582         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3583         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3584         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3585         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3586         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3587         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3588         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3589         int x, startx = span->startx, endx = span->endx;
3590         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3591         float LightVectordata[4];
3592         float LightVectorslope[4];
3593         float EyeVectordata[4];
3594         float EyeVectorslope[4];
3595         float VectorSdata[4];
3596         float VectorSslope[4];
3597         float VectorTdata[4];
3598         float VectorTslope[4];
3599         float VectorRdata[4];
3600         float VectorRslope[4];
3601         float z;
3602         float diffusetex[4];
3603         float glosstex[4];
3604         float surfacenormal[4];
3605         float lightnormal[4];
3606         float lightnormal_modelspace[4];
3607         float eyenormal[4];
3608         float specularnormal[4];
3609         float diffuse;
3610         float specular;
3611         float SpecularPower;
3612         int d[4];
3613         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3614         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3615         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3616         Color_Glow[3] = 0.0f;
3617         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3618         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3619         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3620         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3621         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3622         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3623         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3624         Color_Pants[3] = 0.0f;
3625         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3626         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3627         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3628         Color_Shirt[3] = 0.0f;
3629         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3630         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3631         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3632         {
3633                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3634                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3635         }
3636         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3637         {
3638                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3639         }
3640         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3641         {
3642                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3643                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3644                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3645                 Color_Diffuse[3] = 0.0f;
3646                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3647                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3648                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3649                 LightColor[3] = 0.0f;
3650                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3651                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3652                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3653                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3654                 Color_Specular[3] = 0.0f;
3655                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3656                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3657                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3658
3659                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3660                 {
3661                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3662                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3663                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3664                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3665                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3666                 }
3667                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3668                 {
3669                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3670                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3671                 }
3672                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3673                 {
3674                         // nothing of this needed
3675                 }
3676                 else
3677                 {
3678                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3679                 }
3680
3681                 for (x = startx;x < endx;x++)
3682                 {
3683                         z = buffer_z[x];
3684                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3685                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3686                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3687                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3688                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3689                         {
3690                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3691                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3692                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3693                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3694                         }
3695                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3696                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3697                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3698                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3699                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3700                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3701                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3702                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3703
3704                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3705                         {
3706                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3707                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3708                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3709                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3710
3711                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3712                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3713                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3714                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3715
3716                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3717                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3718                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3719                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3720
3721                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3722                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3723                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3724                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3725
3726                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3727                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3728
3729                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3730                                 {
3731                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3732                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3733                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3734                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3735                                 }
3736                         }
3737                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3738                         {
3739                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3740                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3741                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3742                                 {
3743                                         float f = 1.0f / 256.0f;
3744                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3745                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3746                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3747                                 }
3748                         }
3749                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3750                         {
3751                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3752                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3753                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3754                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3755
3756                                 LightColor[0] = 1.0;
3757                                 LightColor[1] = 1.0;
3758                                 LightColor[2] = 1.0;
3759                         }
3760                         else
3761                         {
3762                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3763                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3764                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3765                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3766                         }
3767
3768                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3769
3770                         if(thread->shader_exactspecularmath)
3771                         {
3772                                 // reflect lightnormal at surfacenormal, take the negative of that
3773                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3774                                 float f;
3775                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3776                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3777                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3778                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3779
3780                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3781                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3782                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3783                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3784                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3785
3786                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3787                         }
3788                         else
3789                         {
3790                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3791                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3792                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3793                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3794
3795                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3796                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3797                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3798                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3799
3800                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3801                         }
3802
3803                         specular = pow(specular, SpecularPower * glosstex[3]);
3804                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3805                         {
3806                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3807                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3808                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3809                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3810                         }
3811                         else
3812                         {
3813                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3814                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3815                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3816                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3817                         }
3818
3819                         buffer_FragColorbgra8[x*4+0] = d[0];
3820                         buffer_FragColorbgra8[x*4+1] = d[1];
3821                         buffer_FragColorbgra8[x*4+2] = d[2];
3822                         buffer_FragColorbgra8[x*4+3] = d[3];
3823                 }
3824         }
3825         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3826         {
3827                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3828                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3829                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3830                 Color_Diffuse[3] = 0.0f;
3831                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3832                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3833                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3834                 LightColor[3] = 0.0f;
3835                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3836
3837                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3838                 {
3839                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3840                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3841                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3842                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3843                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3844                 }
3845                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3846                 {
3847                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3848                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3849                 }
3850                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3851                 {
3852                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3853                 }
3854                 else
3855                 {
3856                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3857                 }
3858
3859                 for (x = startx;x < endx;x++)
3860                 {
3861                         z = buffer_z[x];
3862                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3863                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3864                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3865                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3866                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3867                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3868                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3869                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3870
3871                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3872                         {
3873                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3874                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3875                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3876                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3877
3878                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3879                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3880                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3881                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3882
3883                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3884                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3885                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3886                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3887
3888                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3889                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3890                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3891                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3892
3893                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3894                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3895
3896                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3897                                 {
3898                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3899                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3900                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3901                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3902                                 }
3903                         }
3904                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3905                         {
3906                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3907                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3908                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3909                                 {
3910                                         float f = 1.0f / 256.0f;
3911                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3912                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3913                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3914                                 }
3915                         }
3916                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3917                         {
3918                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3919                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3920                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3921                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3922
3923                                 LightColor[0] = 1.0;
3924                                 LightColor[1] = 1.0;
3925                                 LightColor[2] = 1.0;
3926                         }
3927                         else
3928                         {
3929                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3930                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3931                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3932                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3933                         }
3934
3935                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3936                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3937                         {
3938                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3939                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3940                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3941                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3942                         }
3943                         else
3944                         {
3945                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3946                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3947                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
3948                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
3949                         }
3950                         buffer_FragColorbgra8[x*4+0] = d[0];
3951                         buffer_FragColorbgra8[x*4+1] = d[1];
3952                         buffer_FragColorbgra8[x*4+2] = d[2];
3953                         buffer_FragColorbgra8[x*4+3] = d[3];
3954                 }
3955         }
3956         else
3957         {
3958                 for (x = startx;x < endx;x++)
3959                 {
3960                         z = buffer_z[x];
3961                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3962                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3963                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3964                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3965
3966                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3967                         {
3968                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3969                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3970                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3971                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3972                         }
3973                         else
3974                         {
3975                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
3976                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
3977                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
3978                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3979                         }
3980                         buffer_FragColorbgra8[x*4+0] = d[0];
3981                         buffer_FragColorbgra8[x*4+1] = d[1];
3982                         buffer_FragColorbgra8[x*4+2] = d[2];
3983                         buffer_FragColorbgra8[x*4+3] = d[3];
3984                 }
3985         }
3986         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3987 }
3988
3989
3990
// DPSOFTRAST_VertexShader_LightSource
// Vertex stage for the light-source shader. Takes no parameters and returns nothing:
// every input and output goes through the global dpsoftrast state.
// For each of dpsoftrast.numvertices vertices it reads the position and the three vectors held in
// post_array4f TEXCOORD1/TEXCOORD2/TEXCOORD3 (called svector, tvector and normal here), then
// overwrites TEXCOORD1 with the vertex-to-light vector and TEXCOORD2 with the vertex-to-eye vector,
// each expressed as its dot products with svector, tvector and normal (w is written as 0).
// DPSOFTRAST_PixelShader_LightSource in this file reads TEXCOORD1 as LightVector, TEXCOORD2 as
// EyeVector and TEXCOORD3 as CubeVector via DPSOFTRAST_CALCATTRIB4F.
3991 void DPSOFTRAST_VertexShader_LightSource(void)
3992 {
3993         int i;
3994         int numvertices = dpsoftrast.numvertices;
3995         float LightPosition[4];
3996         float LightVector[4];
3997         float LightVectorModelSpace[4];
3998         float EyePosition[4];
3999         float EyeVectorModelSpace[4];
4000         float EyeVector[4];
4001         float position[4];
4002         float svector[4];
4003         float tvector[4];
4004         float normal[4];
        // Light and eye positions are copied from the uniform table; all four components are
        // fetched but only components 0..2 are used in this function.
        // NOTE(review): presumably model-space positions, judging by the *ModelSpace names - confirm.
4005         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4006         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4007         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4008         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4009         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4010         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4011         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4012         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
        // NOTE(review): DPSOFTRAST_Array_Load / DPSOFTRAST_Array_Transform are defined elsewhere;
        // presumably they fill the dpsoftrast.post_array4f entries that the loop below reads - confirm.
        // TEXCOORD0 goes through the TexMatrix uniform; TEXCOORD4 is loaded but not touched again here.
4013         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4014         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4015         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4016         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4017         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4018         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4019         for (i = 0;i < numvertices;i++)
4020         {
                // Read position and the three basis vectors first: TEXCOORD1 and TEXCOORD2 are
                // overwritten later in this same iteration, so these reads must stay ahead of the writes.
4021                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4022                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4023                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4024                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4025                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4026                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4027                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4028                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4029                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4030                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4031                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4032                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
                // Vector from the vertex to the light, then its components along svector, tvector
                // and normal; the result replaces TEXCOORD1 with w = 0.
4033                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4034                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4035                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4036                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4037                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4038                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4039                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4040                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4041                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4042                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
                // Same computation for the eye position; the result replaces TEXCOORD2 with w = 0.
4043                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4044                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4045                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4046                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4047                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4048                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4049                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4050                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4051                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4052                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4053         }
        // Final passes use the ModelViewProjection and ModelToLight uniform matrices.
        // NOTE(review): the meaning of the -1 argument and the assumed (output, input, matrix)
        // argument order belong to helpers not visible here - confirm. If that order holds,
        // TEXCOORD3 (the normal read in the loop above) is replaced by a transform of POSITION.
4054         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4055         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4056 }
4057
4058 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4059 {
4060 #ifdef SSE2_PRESENT
4061         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4062         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4063         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4064         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4065         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4066         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4067         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4068         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4069         int x, startx = span->startx, endx = span->endx;
4070         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4071         float CubeVectordata[4];
4072         float CubeVectorslope[4];
4073         float LightVectordata[4];
4074         float LightVectorslope[4];
4075         float EyeVectordata[4];
4076         float EyeVectorslope[4];
4077         float z;
4078         float diffusetex[4];
4079         float glosstex[4];
4080         float surfacenormal[4];
4081         float lightnormal[4];
4082         float eyenormal[4];
4083         float specularnormal[4];
4084         float diffuse;
4085         float specular;
4086         float SpecularPower;
4087         float CubeVector[4];
4088         float attenuation;
4089         int d[4];
4090         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4091         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4092         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4093         Color_Glow[3] = 0.0f;
4094         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4095         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4096         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4097         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4098         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4099         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4100         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4101         Color_Diffuse[3] = 0.0f;
4102         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4103         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4104         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4105         Color_Specular[3] = 0.0f;
4106         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4107         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4108         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4109         Color_Pants[3] = 0.0f;
4110         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4111         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4112         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4113         Color_Shirt[3] = 0.0f;
4114         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4115         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4116         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4117         LightColor[3] = 0.0f;
4118         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4119         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4120         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4121         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4122         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4123         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4124         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4125         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4126         {
4127                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4128                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4129         }
4130         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4131                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4132         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4133         {
4134                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4135                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4136                 for (x = startx;x < endx;x++)
4137                 {
4138                         z = buffer_z[x];
4139                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4140                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4141                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4142                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4143                         if (attenuation < 0.01f)
4144                                 continue;
4145                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4146                         {
4147                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4148                                 if (attenuation < 0.01f)
4149                                         continue;
4150                         }
4151
4152                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4153                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4154                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4155                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4156                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4157                         {
4158                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4159                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4160                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4161                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4162                         }
4163                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4164                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4165                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4166                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4167                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4168                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4169                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4170                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4171
4172                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4173                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4174                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4175                         DPSOFTRAST_Vector3Normalize(lightnormal);
4176
4177                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4178
4179                         if(thread->shader_exactspecularmath)
4180                         {
4181                                 // reflect lightnormal at surfacenormal, take the negative of that
4182                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4183                                 float f;
4184                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4185                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4186                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4187                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4188
4189                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4190                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4191                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4192                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4193                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4194
4195                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4196                         }
4197                         else
4198                         {
4199                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4200                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4201                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4202                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4203
4204                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4205                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4206                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4207                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4208
4209                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4210                         }
4211                         specular = pow(specular, SpecularPower * glosstex[3]);
4212
4213                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4214                         {
4215                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4216                                 attenuation *= (1.0f / 255.0f);
4217                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4218                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4219                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4220                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4221                         }
4222                         else
4223                         {
4224                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4225                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4226                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4227                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4228                         }
4229                         buffer_FragColorbgra8[x*4+0] = d[0];
4230                         buffer_FragColorbgra8[x*4+1] = d[1];
4231                         buffer_FragColorbgra8[x*4+2] = d[2];
4232                         buffer_FragColorbgra8[x*4+3] = d[3];
4233                 }
4234         }
4235         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4236         {
4237                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4238                 for (x = startx;x < endx;x++)
4239                 {
4240                         z = buffer_z[x];
4241                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4242                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4243                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4244                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4245                         if (attenuation < 0.01f)
4246                                 continue;
4247                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4248                         {
4249                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4250                                 if (attenuation < 0.01f)
4251                                         continue;
4252                         }
4253
4254                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4255                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4256                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4257                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4258                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4259                         {
4260                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4261                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4262                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4263                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4264                         }
4265                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4266                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4267                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4268                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4269
4270                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4271                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4272                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4273                         DPSOFTRAST_Vector3Normalize(lightnormal);
4274
4275                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4276                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4277                         {
4278                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4279                                 attenuation *= (1.0f / 255.0f);
4280                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4281                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4282                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4283                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4284                         }
4285                         else
4286                         {
4287                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4288                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4289                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4290                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4291                         }
4292                         buffer_FragColorbgra8[x*4+0] = d[0];
4293                         buffer_FragColorbgra8[x*4+1] = d[1];
4294                         buffer_FragColorbgra8[x*4+2] = d[2];
4295                         buffer_FragColorbgra8[x*4+3] = d[3];
4296                 }
4297         }
4298         else
4299         {
4300                 for (x = startx;x < endx;x++)
4301                 {
4302                         z = buffer_z[x];
4303                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4304                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4305                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4306                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4307                         if (attenuation < 0.01f)
4308                                 continue;
4309                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4310                         {
4311                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4312                                 if (attenuation < 0.01f)
4313                                         continue;
4314                         }
4315
4316                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4317                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4318                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4319                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4320                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4321                         {
4322                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4323                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4324                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4325                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4326                         }
4327                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4328                         {
4329                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4330                                 attenuation *= (1.0f / 255.0f);
4331                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4332                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4333                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4334                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4335                         }
4336                         else
4337                         {
4338                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4339                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4340                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4341                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4342                         }
4343                         buffer_FragColorbgra8[x*4+0] = d[0];
4344                         buffer_FragColorbgra8[x*4+1] = d[1];
4345                         buffer_FragColorbgra8[x*4+2] = d[2];
4346                         buffer_FragColorbgra8[x*4+3] = d[3];
4347                 }
4348         }
4349         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4350 #endif
4351 }
4352
4353
4354
// Vertex stage of the refraction shader mode.
// Call order matters: the first call reads the POSITION array and the last
// call overwrites POSITION in place, so the TEXCOORD1 copy has to be produced
// before POSITION is transform-projected.
4355 void DPSOFTRAST_VertexShader_Refraction(void)
4356 {
        // TEXCOORD1 = POSITION transformed by the ModelViewProjection matrix uniform;
        // DPSOFTRAST_PixelShader_Refraction reads TEXCOORD1 as ModelViewProjectionPosition
4357         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
        // TEXCOORD0 is transformed in place by the TexMatrix uniform
4358         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
        // finally POSITION itself is transformed in place with the same matrix
        // NOTE(review): presumably TransformProject also does the perspective projection - confirm in its definition
4359         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4360 }
4361
4362 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4363 {
4364         // DIRTY TRICK: only do sideways displacement. Not correct, but cheaper and thus better for SW.
4365
4366         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4367         float z;
4368         int x, startx = span->startx, endx = span->endx;
4369
4370         // texture reads
4371         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4372         //unsigned char buffer_texture_refractionbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4373         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4374
4375         // varyings
4376         float ModelViewProjectionPositiondata[4];
4377         float ModelViewProjectionPositionslope[4];
4378
4379         // uniforms
4380         float ScreenScaleRefractReflect[2];
4381         float ScreenCenterRefractReflect[2];
4382         float DistortScaleRefractReflect[2];
4383         float RefractColor[4];
4384
4385         const unsigned char * RESTRICT pixelbase;
4386         const unsigned char * RESTRICT pixel[4];
4387         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4388         if(!texture) return;
4389         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[0][0];
4390
4391         // read textures
4392         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4393         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4394         //DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_refractionbgra8, GL20TU_REFRACTION, DPSOFTRAST_ARRAY_TEXCOORD1, buffer_z);
4395
4396         // read varyings
4397         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD1); // or POSITION?
4398
4399         // read uniforms
4400         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4401         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4402         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4403         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4404         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4405         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4406         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4407         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4408         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4409         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4410
4411         // do stuff
4412         for (x = startx;x < endx;x++)
4413         {
4414                 float SafeScreenTexCoord[2];
4415                 float ScreenTexCoord[2];
4416                 float v[3];
4417                 float iw;
4418                 unsigned char c[4];
4419
4420                 z = buffer_z[x];
4421
4422                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4423                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4424         
4425                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4426                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4427                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4428
4429                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4430                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4431                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4432                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4433                 DPSOFTRAST_Vector3Normalize(v);
4434                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4435                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4436
4437                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4438                 if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4439                 {
4440                         unsigned int tc[2] = { ScreenTexCoord[0] * (texture->mipmap[0][2]<<16) - 32768, ScreenTexCoord[1] * (texture->mipmap[0][3]<<16) - 32678};
4441                         unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
4442                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
4443                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
4444                         int tci[2] = { tc[0]>>16, tc[1]>>16 };
4445                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4446                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4447                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4448                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4449                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4450                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4451                         pixel[1] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci1[0]);
4452                         pixel[2] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci[0]);
4453                         pixel[3] = pixelbase + 4 * (tci1[1]*texture->mipmap[0][2]+tci1[0]);
4454                         c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
4455                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
4456                         c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
4457                 }
4458                 else
4459                 {
4460                         int tci[2] = { ScreenTexCoord[0] * texture->mipmap[0][2] - 0.5, ScreenTexCoord[1] * texture->mipmap[0][3] - 0.5 };
4461                         int tci1[2] = { tci[0] + 1, tci[1] + 1 };
4462                         tci[0] = tci[0] >= 0 ? (tci[0] <= texture->mipmap[0][2]-1 ? tci[0] : texture->mipmap[0][2]-1) : 0;
4463                         tci[1] = tci[1] >= 0 ? (tci[1] <= texture->mipmap[0][3]-1 ? tci[1] : texture->mipmap[0][3]-1) : 0;
4464                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= texture->mipmap[0][2]-1 ? tci1[0] : texture->mipmap[0][2]-1) : 0;
4465                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= texture->mipmap[0][3]-1 ? tci1[1] : texture->mipmap[0][3]-1) : 0;
4466                         pixel[0] = pixelbase + 4 * (tci[1]*texture->mipmap[0][2]+tci[0]);
4467                         c[0] = pixel[0][0];
4468                         c[1] = pixel[0][1];
4469                         c[2] = pixel[0][2];
4470                 }
4471
4472                 //p = (int) bound(startx, x + (ScreenTexCoord[0] - SafeScreenTexCoord[0]) / (ModelViewProjectionPositionslope[0]*z), endx-1);
4473                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4474                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4475                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4476                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4477         }
4478
4479         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4480 }
4481
4482
4483
// Vertex stage of the water shader mode: only transforms the POSITION array
// in place with the ModelViewProjection matrix uniform; no other vertex
// arrays are set up here.
4484 void DPSOFTRAST_VertexShader_Water(void)
4485 {
4486         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4487 }
4488
4489
4490 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4491 {
4492         // TODO: IMPLEMENT
4493         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4494         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4495         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4496         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4497         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4498 }
4499
4500
4501
// Vertex stage of the show-depth shader mode: only transforms the POSITION
// array in place with the ModelViewProjection matrix uniform; no other
// vertex arrays are set up here.
4502 void DPSOFTRAST_VertexShader_ShowDepth(void)
4503 {
4504         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4505 }
4506
4507 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4508 {
4509         // TODO: IMPLEMENT
4510         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4511         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4512         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4513         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4514         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4515 }
4516
4517
4518
// Vertex stage of the deferred-geometry shader mode: only transforms the
// POSITION array in place with the ModelViewProjection matrix uniform; no
// other vertex arrays are set up here.
4519 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4520 {
4521         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4522 }
4523
4524 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4525 {
4526         // TODO: IMPLEMENT
4527         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4528         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4529         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4530         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4531         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4532 }
4533
4534
4535
// Vertex stage of the deferred-light-source shader mode: only transforms the
// POSITION array in place with the ModelViewProjection matrix uniform; no
// other vertex arrays are set up here.
4536 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4537 {
4538         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4539 }
4540
4541 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4542 {
4543         // TODO: IMPLEMENT
4544         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4545         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4546         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4547         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4548         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4549 }
4550
4551
4552
// Description of one shader mode: the entry type of DPSOFTRAST_ShaderModeTable.
// The table initializes these members positionally, so their order must not
// be changed without updating every table row.
4553 typedef struct DPSOFTRAST_ShaderModeInfo_s
4554 {
4555         int lodarrayindex; // NOTE(review): every table row sets this to 2; its consumer is not in this part of the file - confirm meaning
4556         void (*Vertex)(void); // vertex stage (the table stores the DPSOFTRAST_VertexShader_* functions here)
4557         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span); // pixel stage, called once per span by DPSOFTRAST_Draw_ProcessSpans
4558         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL]; // DPSOFTRAST_ARRAY_* indices listed for this mode; table rows end the list with ~0
4559         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS]; // GL20TU_* texture units listed for this mode; table rows end the list with ~0
4560 }
4561 DPSOFTRAST_ShaderModeInfo;
4562
4563 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4564 {
4565         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4566         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4567         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4568         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4569         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4570         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4571         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4572         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4573         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4574         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4575         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4576         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4577         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {~0}},
4578         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4579         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4580         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4581 };
4582
4583 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4584 {
4585         int i;
4586         int x;
4587         int startx;
4588         int endx;
4589 //      unsigned int c;
4590 //      unsigned int *colorpixel;
4591         unsigned int *depthpixel;
4592         float w;
4593         float wslope;
4594         int depth;
4595         int depthslope;
4596         unsigned int d;
4597         DPSOFTRAST_State_Triangle *triangle;
4598         DPSOFTRAST_State_Span *span;
4599         unsigned char pixelmask[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4600         for (i = 0; i < thread->numspans; i++)
4601         {
4602                 span = &thread->spans[i];
4603                 triangle = &thread->triangles[span->triangle];
4604                 if (thread->depthtest && dpsoftrast.fb_depthpixels)
4605                 {
4606                         wslope = triangle->w[0];
4607                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
4608                         depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
4609                         depth = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
4610                         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4611                         startx = span->startx;
4612                         endx = span->endx;
4613                         switch(thread->fb_depthfunc)
4614                         {
4615                         default:
4616                         case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4617                         case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4618                         case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4619                         case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4620                         case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4621                         case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4622                         case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4623                         }
4624                         //colorpixel = dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;;
4625                         //for (x = startx;x < endx;x++)
4626                         //      colorpixel[x] = (depthpixel[x] & 0xFF000000) ? (0x00FF0000) : (depthpixel[x] & 0x00FF0000);
4627                         // if there is no color buffer, skip pixel shader
4628                         while (startx < endx && !pixelmask[startx])
4629                                 startx++;
4630                         while (endx > startx && !pixelmask[endx-1])
4631                                 endx--;
4632                         if (startx >= endx)
4633                                 continue; // no pixels to fill
4634                         span->pixelmask = pixelmask;
4635                         span->startx = startx;
4636                         span->endx = endx;
4637                         // run pixel shader if appropriate
4638                         // do this before running depthmask code, to allow the pixelshader
4639                         // to clear pixelmask values for alpha testing
4640                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4641                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4642                         if (thread->depthmask)
4643                                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4644                                         if (pixelmask[x])
4645                                                 depthpixel[x] = d;
4646                 }
4647                 else
4648                 {
4649                         // no depth testing means we're just dealing with color...
4650                         // if there is no color buffer, skip pixel shader
4651                         if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4652                         {
4653                                 memset(pixelmask + span->startx, 1, span->endx - span->startx);
4654                                 span->pixelmask = pixelmask;
4655                                 DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4656                         }
4657                 }
4658         }
4659         thread->numspans = 0;
4660 }
4661
// Declares the Draw command (slot 22) through DEFCOMMAND; the macro itself is
// defined outside this excerpt, and the code below uses the result as
// DPSOFTRAST_Command_Draw / DPSOFTRAST_OPCODE_Draw.
// Fields as used by the visible code:
//   starty, endy   - row range of the draw, filled in by DPSOFTRAST_DrawTriangles
//   refcount       - starts at dpsoftrast.numthreads; each thread decrements it
//                    once in DPSOFTRAST_Interpret_Draw
//   clipped        - copied from dpsoftrast.drawclipped; when zero the near-plane
//                    clip test is bypassed
//   firstvertex, numvertices, numtriangles - copied from the draw call
//   arrays         - screen coordinates followed by the float[4] vertex arrays
//   element3i/3s   - private copy of the index list (at most one is non-NULL)
//   datasize       - not referenced by the code visible here
4662 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4663
// Executes one Draw command on behalf of a single rasterizer thread.
// For every triangle of the command this clips against the near plane,
// applies the cullface test, rejects triangles outside the scissor rectangle
// or outside this thread's two row bands (miny1..maxy1 and miny2..maxy2),
// sets up the interpolation planes for w and for each attribute array of the
// active shader mode, picks a mip level per texture unit, and emits spans that
// DPSOFTRAST_Draw_ProcessSpans consumes in batches.
// The whole body is compiled only when SSE2_PRESENT is defined.
// Every thread drops one reference on command->refcount when it is done with
// (or skips) the command; the thread that drops the last reference frees
// command->arrays when commandsize is no larger than the bare command header
// (presumably meaning the vertex data was allocated separately rather than
// stored inline after the header - see DPSOFTRAST_Draw_AllocateDrawCommand).
4664 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4665 {
4666 #ifdef SSE2_PRESENT
4667         int cullface = thread->cullface;
4668         int minx, maxx, miny, maxy;
4669         int miny1, maxy1, miny2, maxy2;
4670         __m128i fbmin, fbmax;
4671         __m128 viewportcenter, viewportscale;
4672         int firstvertex = command->firstvertex;
4673         int numvertices = command->numvertices;
4674         int numtriangles = command->numtriangles;
4675         const int *element3i = command->element3i;
4676         const unsigned short *element3s = command->element3s;
4677         int clipped = command->clipped;
4678         int i;
4679         int j;
4680         int k;
4681         int y;
4682         int e[3];
4683         __m128i screeny;
4684         int starty, endy, bandy;
4685         int numpoints;
4686         int clipcase;
4687         float clipdist[4];
4688         __m128 triangleedge1, triangleedge2, trianglenormal;
4689         __m128 clipfrac[3];
4690         __m128 screen[4];
4691         DPSOFTRAST_State_Triangle *triangle;
4692         DPSOFTRAST_Texture *texture;
4693         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
             // vertical extent of the scissor rectangle, and this thread's two
             // row bands limited to that extent via bound()
4694         miny = thread->fb_scissor[1];
4695         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4696         miny1 = bound(miny, thread->miny1, maxy);
4697         maxy1 = bound(miny, thread->maxy1, maxy);
4698         miny2 = bound(miny, thread->miny2, maxy);
4699         maxy2 = bound(miny, thread->maxy2, maxy);
             // the command's row range touches neither band: release our
             // reference (freeing the vertex data if we were the last user) and leave
4700         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4701         {
4702                 if (!ATOMIC_DECREMENT(command->refcount))
4703                 {
4704                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4705                                 MM_FREE(command->arrays);
4706                 }
4707                 return;
4708         }
4709         minx = thread->fb_scissor[0];
4710         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
             // limits replicated into 16-bit lanes as (x, y) pairs so they can be
             // compared against the packed triangle bounds; fbmax has 1 subtracted
             // from every lane
4711         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4712         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4713         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4714         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4715         screen[3] = _mm_setzero_ps();
4716         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4717         for (i = 0;i < numtriangles;i++)
4718         {
                     // command->arrays starts with the screen coordinates; the
                     // float[4] vertex arrays follow, numvertices entries each
4719                 const float *screencoord4f = command->arrays;
4720                 const float *arrays = screencoord4f + numvertices*4;
4721
4722                 // generate the 3 edges of this triangle
4723                 // generate spans for the triangle - switch based on left split or right split classification of triangle
                     // fetch the three vertex indices, rebased by firstvertex when an
                     // index list is present (16-bit indices take precedence over 32-bit);
                     // without an index list the vertices are consumed sequentially
4724                 if (element3s)
4725                 {
4726                         e[0] = element3s[i*3+0] - firstvertex;
4727                         e[1] = element3s[i*3+1] - firstvertex;
4728                         e[2] = element3s[i*3+2] - firstvertex;
4729                 }
4730                 else if (element3i)
4731                 {
4732                         e[0] = element3i[i*3+0] - firstvertex;
4733                         e[1] = element3i[i*3+1] - firstvertex;
4734                         e[2] = element3i[i*3+2] - firstvertex;
4735                 }
4736                 else
4737                 {
4738                         e[0] = i*3+0;
4739                         e[1] = i*3+1;
4740                         e[2] = i*3+2;
4741                 }
4742
                     // SKIPBACKFACE: computes the two edge vectors relative to screen[1]
                     // and a scalar cross-product term, then `continue`s to the next
                     // triangle when its sign fails the cullface test (GL_BACK skips
                     // negative values, GL_FRONT skips positive values).  The edge
                     // vectors and the normal term are reused for the slope setup below.
4743 #define SKIPBACKFACE \
4744                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4745                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4746                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4747                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4748                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4749                 switch(cullface) \
4750                 { \
4751                 case GL_BACK: \
4752                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4753                                 continue; \
4754                         break; \
4755                 case GL_FRONT: \
4756                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4757                                 continue; \
4758                         break; \
4759                 }
4760
                     // CLIPPEDVERTEXLERP: stores in clipfrac[p1] the fraction along edge
                     // p1->p2 at which clipdist reaches zero, then projects the
                     // interpolated position into screen[k]
4761 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4762                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4763                         { \
4764                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4765                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4766                         }
                     // CLIPPEDVERTEXCOPY: loads the stored screen coordinate of vertex p1
                     // into screen[k]
4767 #define CLIPPEDVERTEXCOPY(k,p1) \
4768                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4769
                     // GENATTRIB*: produce the three corner values of the attribute array
                     // currently addressed by `arrays`, matching the clip case chosen
                     // below; the LERP form reuses the clipfrac[] values stored by
                     // CLIPPEDVERTEXLERP
4770 #define GENATTRIBCOPY(attrib, p1) \
4771                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4772 #define GENATTRIBLERP(attrib, p1, p2) \
4773                 { \
4774                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4775                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4776                 }
4777 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4778                 switch(clipcase) \
4779                 { \
4780                 default: \
4781                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4782                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4783                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4784                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4785                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4786                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4787                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4788                 }
4789
                     // draws not flagged as clipped bypass the per-vertex distance test
                     // and jump straight into the unclipped branch
4790                 if (! clipped)
4791                         goto notclipped;
4792
4793                 // calculate distance from nearplane
4794                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4795                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4796                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
                     // classify by which vertices have clipdist >= 0; each branch builds
                     // the 3 or 4 screen points of the clipped polygon and records the
                     // case number that GENATTRIBS switches on
4797                 if (clipdist[0] >= 0.0f)
4798                 {
4799                         if (clipdist[1] >= 0.0f)
4800                         {
4801                                 if (clipdist[2] >= 0.0f)
4802                                 {
4803                                 notclipped:
4804                                         // triangle is entirely in front of nearplane
4805                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4806                                         SKIPBACKFACE;
4807                                         numpoints = 3;
4808                                         clipcase = 0;
4809                                 }
4810                                 else
4811                                 {
4812                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4813                                         SKIPBACKFACE;
4814                                         numpoints = 4;
4815                                         clipcase = 1;
4816                                 }
4817                         }
4818                         else
4819                         {
4820                                 if (clipdist[2] >= 0.0f)
4821                                 {
4822                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
4823                                         SKIPBACKFACE;
4824                                         numpoints = 4;
4825                                         clipcase = 2;
4826                                 }
4827                                 else
4828                                 {
4829                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
4830                                         SKIPBACKFACE;
4831                                         numpoints = 3;
4832                                         clipcase = 3;
4833                                 }
4834                         }
4835                 }
4836                 else if (clipdist[1] >= 0.0f)
4837                 {
4838                         if (clipdist[2] >= 0.0f)
4839                         {
4840                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
4841                                 SKIPBACKFACE;
4842                                 numpoints = 4;
4843                                 clipcase = 4;
4844                         }
4845                         else
4846                         {
4847                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
4848                                 SKIPBACKFACE;
4849                                 numpoints = 3;
4850                                 clipcase = 5;
4851                         }
4852                 }
4853                 else if (clipdist[2] >= 0.0f)
4854                 {
4855                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
4856                         SKIPBACKFACE;
4857                         numpoints = 3;
4858                         clipcase = 6;
4859                 }
4860                 else continue; // triangle is entirely behind nearplane
4861
4862                 {
4863                         // calculate integer y coords for triangle points
                             // (x, y) of all points are truncated and packed to 16 bits; with
                             // only three points, screen[2] is duplicated into the fourth slot
4864                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
4865                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
4866                                         screenmin = _mm_min_epi16(screeni, screenir),
4867                                         screenmax = _mm_max_epi16(screeni, screenir);
4868                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
4869                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
4870                         screenmin = _mm_max_epi16(screenmin, fbmin);
4871                         screenmax = _mm_min_epi16(screenmax, fbmax);
4872                         // skip offscreen triangles
4873                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
4874                                 continue;
4875                         starty = _mm_extract_epi16(screenmin, 1);
4876                         endy = _mm_extract_epi16(screenmax, 1)+1;
                             // the triangle lies entirely in the gap between this thread's
                             // two bands
4877                         if (starty >= maxy1 && endy <= miny2)
4878                                 continue;
                             // keep only the y halves, sign-extended to 32 bits per point
4879                         screeny = _mm_srai_epi32(screeni, 16);
4880                 }
4881
                     // next free slot in this thread's triangle batch
4882                 triangle = &thread->triangles[thread->numtriangles];
4883
4884                 // calculate attribute plans for triangle data...
4885                 // okay, this triangle is going to produce spans, we'd better project
4886                 // the interpolants now (this is what gives perspective texturing),
4887                 // this consists of simply multiplying all arrays by the W coord
4888                 // (which is basically 1/Z), which will be undone per-pixel
4889                 // (multiplying by Z again) to get the perspective-correct array
4890                 // values
4891                 {
4892                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
4893                         __m128 mipedgescale, mipdensity;
4894                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
4895                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
4896                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
4897                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
4898                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
4899                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
4900                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
4901                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
                             // plane for w itself: x slope, y slope and the value at the
                             // screen origin, stored in triangle->w[0..2]
4902                         attribedge1 = _mm_sub_ss(w0, w1);
4903                         attribedge2 = _mm_sub_ss(w2, w1);
4904                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
4905                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
4906                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
4907                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
4908                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
4909                         _mm_store_ss(&triangle->w[0], attribxslope);
4910                         _mm_store_ss(&triangle->w[1], attribyslope);
4911                         _mm_store_ss(&triangle->w[2], attriborigin);
4912                         mipedgescale = _mm_setzero_ps();
                             // same plane setup for every attribute array listed by the shader
                             // mode (the list ends at the first entry >= DPSOFTRAST_ARRAY_TOTAL);
                             // `arrays` advances by one array per listed entry, so the arrays
                             // are expected to be stored in list order
4913                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
4914                         {
4915                                 __m128 attrib0, attrib1, attrib2;
4916                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
4917                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
4918                                         break;
4919                                 arrays += numvertices*4;
4920                                 GENATTRIBS(attrib0, attrib1, attrib2);
4921                                 attriborigin = _mm_mul_ps(attrib1, w1);
4922                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
4923                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
4924                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
4925                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
4926                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
4927                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
4928                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
4929                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
                                     // for the array the shader mode names as its lod array, record
                                     // the attribute change along both edges scaled by the reciprocal
                                     // edge length; this feeds the mip selection below
4930                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
4931                                 {
4932                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
4933                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
4934                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
4935                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
4936                                 }
4937                         }
4938
                             // choose a mip level for each texture unit the shader mode uses;
                             // a unit stays at level 0 unless a texture is bound whose filter
                             // is above DPSOFTRAST_TEXTURE_FILTER_LINEAR
4939                         memset(triangle->mip, 0, sizeof(triangle->mip));
4940                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
4941                         {
4942                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
4943                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
4944                                         break;
4945                                 texture = thread->texbound[texunit];
4946                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
4947                                 {
4948                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
4949                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
4950                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
4951                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
4952                                         // this will be multiplied in the texturing routine by the texture resolution
4953                                         y = _mm_cvtss_si32(mipdensity);
4954                                         if (y > 0)
4955                                         {
                                                     // half the base-2 logarithm of the squared density,
                                                     // capped at the texture's last mip level
4956                                                 y = (int)(log((float)y)*0.5f/M_LN2);
4957                                                 if (y > texture->mipmaps - 1)
4958                                                         y = texture->mipmaps - 1;
4959                                                 triangle->mip[texunit] = y;
4960                                         }
4961                                 }
4962                         }
4963                 }
4964         
                     // walk the rows covered by the triangle: first the part inside band 1
                     // (up to maxy1), then resume at miny2 for the part inside band 2
4965                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
4966                 for (; y < bandy;)
4967                 {
4968                         __m128 xcoords, xslope;
                             // four mask bits per point, set when y is greater than that
                             // point's integer y; the pattern selects the two edges
                             // (edge0p->edge0n and edge1p->edge1n) that bound the current rows
4969                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
4970                         int yccmask = _mm_movemask_epi8(ycc);
4971                         int edge0p, edge0n, edge1p, edge1n;
4972                         int nexty;
4973                         if (numpoints == 4)
4974                         {
4975                                 switch(yccmask)
4976                                 {
4977                                 default:
4978                                 case 0xFFFF: /*0000*/ y = endy; continue;
4979                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
4980                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
4981                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
4982                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
4983                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
4984                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
4985                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
4986                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
4987                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
4988                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
4989                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
4990                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
4991                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
4992                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
4993                                 case 0x0000: /*1111*/ y++; continue;
4994                                 }
4995                         }
4996                         else
4997                         {
4998                                 switch(yccmask)
4999                                 {
5000                                 default:
5001                                 case 0xFFFF: /*000*/ y = endy; continue;
5002                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5003                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5004                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5005                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5006                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5007                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5008                                 case 0x0000: /*111*/ y++; continue;
5009                                 }
5010                         }
                             // nexty: last row handled with this edge pair, limited to the
                             // current band (appears to be the smallest point y not yet
                             // passed - NOTE(review): confirm the lane arithmetic)
5011                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5012                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5013                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5014                         nexty = _mm_extract_epi16(ycc, 0);
5015                         if (nexty >= bandy) nexty = bandy-1;
                             // x per row (dx/dy) for both edges, their x at row y plus 0.5,
                             // and a swap so that the smaller x ends up in the low half
5016                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5017                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5018                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5019                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5020                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5021                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5022                         {
5023                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5024                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5025                         }
5026                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope))
5027                         {
5028                                 int startx, endx, offset;
5029                                 startx = _mm_cvtss_si32(xcoords);
5030                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
                                     // when the row starts left of the scissor, advance startx in
                                     // whole multiples of DPSOFTRAST_DRAW_MAXSPANLENGTH; the remaining
                                     // part left of minx is trimmed per span via span->startx
5031                                 if (startx < minx) 
5032                                 {
5033                                         if (startx < 0) startx = 0;
5034                                         startx += (minx-startx)&~(DPSOFTRAST_DRAW_MAXSPANLENGTH-1);
5035                                 }
5036                                 if (endx > maxx) endx = maxx;
5037                                 if (startx >= endx) continue;
                                     // cut the row into spans of at most DPSOFTRAST_DRAW_MAXSPANLENGTH
                                     // pixels; span->startx/endx are relative to span->x.  The span
                                     // buffer is processed as soon as it is full.
5038                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5039                                 {
5040                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5041                                         span->triangle = thread->numtriangles;
5042                                         span->x = offset;
5043                                         span->y = y;
5044                                         span->startx = max(minx - offset, 0);
5045                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5046                                         if (span->startx >= span->endx)
5047                                                 continue; 
5048                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5049                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5050                                 }
5051                         }
5052                 }
5053
                     // the triangle slot is now in use; when the triangle batch is full,
                     // process the pending spans and start a new batch
5054                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5055                 {
5056                         DPSOFTRAST_Draw_ProcessSpans(thread);
5057                         thread->numtriangles = 0;
5058                 }
5059         }
5060
             // the vertex data is not read past this point: drop our reference,
             // with the same last-user free rule as the early-out at the top
5061         if (!ATOMIC_DECREMENT(command->refcount))
5062         {
5063                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5064                         MM_FREE(command->arrays);
5065         }
5066
             // process whatever spans/triangles are still batched
5067         if (thread->numspans > 0 || thread->numtriangles > 0)
5068         {
5069                 DPSOFTRAST_Draw_ProcessSpans(thread);
5070                 thread->numtriangles = 0;
5071         }
5072 #endif
5073 }
5074
5075 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5076 {
5077         int i;
5078         int j;
5079         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5080         int datasize = 2*numvertices*sizeof(float[4]);
5081         DPSOFTRAST_Command_Draw *command;
5082         unsigned char *data;
5083         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5084         {
5085                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5086                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5087                         break;
5088                 datasize += numvertices*sizeof(float[4]);
5089         }
5090         if (element3s)
5091                 datasize += numtriangles*sizeof(unsigned short[3]);
5092         else if (element3i)
5093                 datasize += numtriangles*sizeof(int[3]);
5094         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5095         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5096         {
5097                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5098                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5099         }
5100         else
5101         {
5102                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5103                 data = (unsigned char *)command + commandsize;
5104         }
5105         command->firstvertex = firstvertex;
5106         command->numvertices = numvertices;
5107         command->numtriangles = numtriangles;
5108         command->arrays = (float *)data;
5109         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5110         dpsoftrast.firstvertex = firstvertex;
5111         dpsoftrast.numvertices = numvertices;
5112         dpsoftrast.screencoord4f = (float *)data;
5113         data += numvertices*sizeof(float[4]);
5114         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5115         data += numvertices*sizeof(float[4]);
5116         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5117         {
5118                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5119                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5120                         break;
5121                 dpsoftrast.post_array4f[j] = (float *)data;
5122                 data += numvertices*sizeof(float[4]);
5123         }
5124         command->element3i = NULL;
5125         command->element3s = NULL;
5126         if (element3s)
5127         {
5128                 command->element3s = (unsigned short *)data;
5129                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5130         }
5131         else if (element3i)
5132         {
5133                 command->element3i = (int *)data;
5134                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5135         }
5136         return command;
5137 }
5138
5139 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5140 {
5141         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5142         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5143         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5144         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5145         if (command->starty >= command->endy)
5146         {
5147                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5148                         MM_FREE(command->arrays);
5149                 DPSOFTRAST_UndoCommand(command->commandsize);
5150                 return;
5151         }
5152         command->clipped = dpsoftrast.drawclipped;
5153         command->refcount = dpsoftrast.numthreads;
5154
5155         if (dpsoftrast.usethreads)
5156         {
5157                 int i;
5158                 DPSOFTRAST_Draw_SyncCommands();
5159                 for (i = 0; i < dpsoftrast.numthreads; i++)
5160                 {
5161                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5162                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5163                                 Thread_CondSignal(thread->drawcond);
5164                 }
5165         }
5166         else
5167         {
5168                 DPSOFTRAST_Draw_FlushThreads();
5169         }
5170 }
5171  
5172 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5173 {
5174         int commandoffset = thread->commandoffset;
5175         while (commandoffset != endoffset)
5176         {
5177                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5178                 switch (command->opcode)
5179                 {
5180 #define INTERPCOMMAND(name) \
5181                 case DPSOFTRAST_OPCODE_##name : \
5182                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5183                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5184                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5185                                 commandoffset = 0; \
5186                         break;
5187                 INTERPCOMMAND(Viewport)
5188                 INTERPCOMMAND(ClearColor)
5189                 INTERPCOMMAND(ClearDepth)
5190                 INTERPCOMMAND(ColorMask)
5191                 INTERPCOMMAND(DepthTest)
5192                 INTERPCOMMAND(ScissorTest)
5193                 INTERPCOMMAND(Scissor)
5194                 INTERPCOMMAND(BlendFunc)
5195                 INTERPCOMMAND(BlendSubtract)
5196                 INTERPCOMMAND(DepthMask)
5197                 INTERPCOMMAND(DepthFunc)
5198                 INTERPCOMMAND(DepthRange)
5199                 INTERPCOMMAND(PolygonOffset)
5200                 INTERPCOMMAND(CullFace)
5201                 INTERPCOMMAND(AlphaTest)
5202                 INTERPCOMMAND(AlphaFunc)
5203                 INTERPCOMMAND(SetTexture)
5204                 INTERPCOMMAND(SetShader)
5205                 INTERPCOMMAND(Uniform4f)
5206                 INTERPCOMMAND(UniformMatrix4f)
5207                 INTERPCOMMAND(Uniform1i)
5208
5209                 case DPSOFTRAST_OPCODE_Draw:
5210                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5211                         commandoffset += command->commandsize;
5212                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5213                                 commandoffset = 0;
5214                         thread->commandoffset = commandoffset;
5215                         break;
5216
5217                 case DPSOFTRAST_OPCODE_Reset:
5218                         commandoffset = 0;
5219                         break;
5220                 }
5221         }
5222         thread->commandoffset = commandoffset;
5223 }
5224
5225 static int DPSOFTRAST_Draw_Thread(void *data)
5226 {
5227         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5228         while(thread->index >= 0)
5229         {
5230                 if (thread->commandoffset != dpsoftrast.drawcommand)
5231                 {
5232                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5233                 }
5234                 else 
5235                 {
5236                         Thread_LockMutex(thread->drawmutex);
5237                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5238                         {
5239                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5240                                 thread->starving = true;
5241                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5242                                 thread->starving = false;
5243                         }
5244                         Thread_UnlockMutex(thread->drawmutex);
5245                 }
5246         }   
5247         return 0;
5248 }
5249
5250 static void DPSOFTRAST_Draw_FlushThreads(void)
5251 {
5252         DPSOFTRAST_State_Thread *thread;
5253         int i;
5254         DPSOFTRAST_Draw_SyncCommands();
5255         if (dpsoftrast.usethreads) 
5256         {
5257                 for (i = 0; i < dpsoftrast.numthreads; i++)
5258                 {
5259                         thread = &dpsoftrast.threads[i];
5260                         if (thread->commandoffset != dpsoftrast.drawcommand)
5261                         {
5262                                 Thread_LockMutex(thread->drawmutex);
5263                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5264                                         Thread_CondSignal(thread->drawcond);
5265                                 Thread_UnlockMutex(thread->drawmutex);
5266                         }
5267                 }
5268                 for (i = 0; i < dpsoftrast.numthreads; i++)
5269                 {
5270                         thread = &dpsoftrast.threads[i];
5271                         if (thread->commandoffset != dpsoftrast.drawcommand)
5272                         {
5273                                 Thread_LockMutex(thread->drawmutex);
5274                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5275                                 {
5276                                         thread->waiting = true;
5277                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5278                                         thread->waiting = false;
5279                                 }
5280                                 Thread_UnlockMutex(thread->drawmutex);
5281                         }
5282                 }
5283         }
5284         else
5285         {
5286                 for (i = 0; i < dpsoftrast.numthreads; i++)
5287                 {
5288                         thread = &dpsoftrast.threads[i];
5289                         if (thread->commandoffset != dpsoftrast.drawcommand)
5290                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5291                 }
5292         }
5293         dpsoftrast.commandpool.usedcommands = 0;
5294 }
5295
// Public flush entry point.  Forwards to DPSOFTRAST_Draw_FlushThreads, which
// syncs the command queue, lets every thread's pending commands execute and
// then resets the command pool's used-command count.
5296 void DPSOFTRAST_Flush(void)
5297 {
5298         DPSOFTRAST_Draw_FlushThreads();
5299 }
5300
// Public finish entry point.  Currently identical to DPSOFTRAST_Flush: it
// only forwards the call and does no additional work of its own.
5301 void DPSOFTRAST_Finish(void)
5302 {
5303         DPSOFTRAST_Flush();
5304 }
5305
5306 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5307 {
5308         int i;
5309         union
5310         {
5311                 int i;
5312                 unsigned char b[4];
5313         }
5314         u;
5315         u.i = 1;
5316         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5317         dpsoftrast.bigendian = u.b[3];
5318         dpsoftrast.fb_width = width;
5319         dpsoftrast.fb_height = height;
5320         dpsoftrast.fb_depthpixels = depthpixels;
5321         dpsoftrast.fb_colorpixels[0] = colorpixels;
5322         dpsoftrast.fb_colorpixels[1] = NULL;
5323         dpsoftrast.fb_colorpixels[1] = NULL;
5324         dpsoftrast.fb_colorpixels[1] = NULL;
5325         dpsoftrast.viewport[0] = 0;
5326         dpsoftrast.viewport[1] = 0;
5327         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5328         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5329         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5330         dpsoftrast.texture_firstfree = 1;
5331         dpsoftrast.texture_end = 1;
5332         dpsoftrast.texture_max = 0;
5333         dpsoftrast.color[0] = 1;
5334         dpsoftrast.color[1] = 1;
5335         dpsoftrast.color[2] = 1;
5336         dpsoftrast.color[3] = 1;
5337         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5338         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5339         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5340         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5341         for (i = 0; i < dpsoftrast.numthreads; i++)
5342         {
5343                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5344                 thread->index = i;
5345                 thread->cullface = GL_BACK;
5346                 thread->colormask[1] = 1;
5347                 thread->colormask[2] = 1;
5348                 thread->colormask[3] = 1;
5349                 thread->blendfunc[0] = GL_ONE;
5350                 thread->blendfunc[1] = GL_ZERO;
5351                 thread->depthmask = true;
5352                 thread->depthtest = true;
5353                 thread->depthfunc = GL_LEQUAL;
5354                 thread->scissortest = false;
5355                 thread->alphatest = false;
5356                 thread->alphafunc = GL_GREATER;
5357                 thread->alphavalue = 0.5f;
5358                 thread->viewport[0] = 0;
5359                 thread->viewport[1] = 0;
5360                 thread->viewport[2] = dpsoftrast.fb_width;
5361                 thread->viewport[3] = dpsoftrast.fb_height;
5362                 thread->scissor[0] = 0;
5363                 thread->scissor[1] = 0;
5364                 thread->scissor[2] = dpsoftrast.fb_width;
5365                 thread->scissor[3] = dpsoftrast.fb_height;
5366                 thread->depthrange[0] = 0;
5367                 thread->depthrange[1] = 1;
5368                 thread->polygonoffset[0] = 0;
5369                 thread->polygonoffset[1] = 0;
5370         
5371                 if (dpsoftrast.interlace)
5372                 {
5373                         thread->miny1 = (i*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5374                         thread->maxy1 = ((i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5375                         thread->miny2 = ((dpsoftrast.numthreads+i)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5376                         thread->maxy2 = ((dpsoftrast.numthreads+i+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
5377                 }
5378                 else
5379                 {
5380                         thread->miny1 = thread->miny2 = (i*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5381                         thread->maxy1 = thread->maxy2 = ((i+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
5382                 }
5383
5384                 thread->numspans = 0;
5385                 thread->numtriangles = 0;
5386                 thread->commandoffset = 0;
5387                 thread->waiting = false;
5388                 thread->starving = false;
5389            
5390                 thread->validate = -1;
5391                 DPSOFTRAST_Validate(thread, -1);
5392  
5393                 if (dpsoftrast.usethreads)
5394                 {
5395                         thread->waitcond = Thread_CreateCond();
5396                         thread->drawcond = Thread_CreateCond();
5397                         thread->drawmutex = Thread_CreateMutex();
5398                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5399                 }
5400         }
5401         return 0;
5402 }
5403
5404 void DPSOFTRAST_Shutdown(void)
5405 {
5406         int i;
5407         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5408         {
5409                 DPSOFTRAST_State_Thread *thread;
5410                 for (i = 0; i < dpsoftrast.numthreads; i++)
5411                 {
5412                         thread = &dpsoftrast.threads[i];
5413                         Thread_LockMutex(thread->drawmutex);
5414                         thread->index = -1;
5415                         Thread_CondSignal(thread->drawcond);
5416                         Thread_UnlockMutex(thread->drawmutex);
5417                         Thread_WaitThread(thread->thread, 0);
5418                         Thread_DestroyCond(thread->waitcond);
5419                         Thread_DestroyCond(thread->drawcond);
5420                         Thread_DestroyMutex(thread->drawmutex);
5421                 }
5422         }
5423         for (i = 0;i < dpsoftrast.texture_end;i++)
5424                 if (dpsoftrast.texture[i].bytes)
5425                         MM_FREE(dpsoftrast.texture[i].bytes);
5426         if (dpsoftrast.texture)
5427                 free(dpsoftrast.texture);
5428         if (dpsoftrast.threads)
5429                 MM_FREE(dpsoftrast.threads);
5430         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5431 }
5432