]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
0e125a915363c6918f4c019ffb31e71d81856742
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
// MSVC only: silence warning 4324.
// NOTE(review): C4324 is presumably the "structure was padded due to alignment
// specifier" warning, which the ALIGN()ed structs below would raise - confirm.
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
// C builds get a bool type by aliasing qboolean (declared outside this file).
13 #ifndef __cplusplus
14 typedef qboolean bool;
15 #endif
16
// ALIGN_SIZE is the alignment handed to _mm_malloc by MM_MALLOC/MM_CALLOC and
// is reused as COMMAND_SIZE.  ATOMIC_SIZE is not referenced in the visible
// part of this file.
17 #define ALIGN_SIZE 16
18 #define ATOMIC_SIZE 4
19
// Per-platform definitions of the alignment and atomic helper macros, only
// selected when SSE_POSSIBLE is defined:
//   ALIGN(var)             declare var with 16 byte alignment
//   ATOMIC(var)            declare var with 4 byte alignment
//   MEMORY_BARRIER         _mm_sfence() on every branch below
//   ATOMIC_COUNTER         type of a counter used with the macros below
//   ATOMIC_INCREMENT/DECREMENT(counter)  atomic +1 / -1 on counter
//   ATOMIC_ADD(counter, val)             atomic add, result discarded (void)
// Anything left undefined here falls back to the plain definitions that
// follow this block.
// NOTE(review): the commented-out alternatives next to MEMORY_BARRIER look like
// full barriers that were replaced by the store fence - confirm the intent.
20 #ifdef SSE_POSSIBLE
21         #if defined(__APPLE__)
22                 #include <libkern/OSAtomic.h>
23                 #define ALIGN(var) var __attribute__((__aligned__(16)))
24                 #define ATOMIC(var) var __attribute__((__aligned__(4)))
25                 #define MEMORY_BARRIER (_mm_sfence())
26                 #define ATOMIC_COUNTER volatile int32_t 
27                 #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
28                 #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
29                 #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
30         #elif defined(__GNUC__) && defined(WIN32)
31                 #define ALIGN(var) var __attribute__((__aligned__(16)))
32                 #define ATOMIC(var) var __attribute__((__aligned__(4)))
33                 #define MEMORY_BARRIER (_mm_sfence())
34                 //(__sync_synchronize())
35                 #define ATOMIC_COUNTER volatile LONG
36                 // this LONG * cast serves to fix an issue with broken mingw
37                 // packages on Ubuntu; these only declare the function to take
38                 // a LONG *, causing a compile error here. This seems to be
39                 // error- and warn-free on platforms that DO declare
40                 // InterlockedIncrement correctly, like mingw on Windows.
41                 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
42                 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
43                 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
44         #elif defined(__GNUC__)
45                 #define ALIGN(var) var __attribute__((__aligned__(16)))
46                 #define ATOMIC(var) var __attribute__((__aligned__(4)))
47                 #define MEMORY_BARRIER (_mm_sfence())
48                 //(__sync_synchronize())
49                 #define ATOMIC_COUNTER volatile int
50                 #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
51                 #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
52                 #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
53         #elif defined(_MSC_VER)
54                 #define ALIGN(var) __declspec(align(16)) var
55                 #define ATOMIC(var) __declspec(align(4)) var
56                 #define MEMORY_BARRIER (_mm_sfence())
57                 //(MemoryBarrier())
58                 #define ATOMIC_COUNTER volatile LONG
59                 #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
60                 #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
61                 #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
62         #endif
63 #endif
64
// Portable fallbacks, used for every helper macro that the platform section
// above did not define (for example when SSE_POSSIBLE is not set).
// These are ordinary, non-atomic operations: ALIGN/ATOMIC add no alignment,
// MEMORY_BARRIER is a no-op and the counter macros are plain ++/--/+=.
65 #ifndef ALIGN
66 #define ALIGN(var) var
67 #endif
68 #ifndef ATOMIC
69 #define ATOMIC(var) var
70 #endif
71 #ifndef MEMORY_BARRIER
72 #define MEMORY_BARRIER ((void)0)
73 #endif
74 #ifndef ATOMIC_COUNTER
75 #define ATOMIC_COUNTER int
76 #endif
77 #ifndef ATOMIC_INCREMENT
78 #define ATOMIC_INCREMENT(counter) (++(counter))
79 #endif
80 #ifndef ATOMIC_DECREMENT
81 #define ATOMIC_DECREMENT(counter) (--(counter))
82 #endif
83 #ifndef ATOMIC_ADD
84 #define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
85 #endif
86
// Memory helpers: MM_MALLOC / MM_CALLOC / MM_FREE.
// With SSE_POSSIBLE they return ALIGN_SIZE aligned memory from _mm_malloc
// (released with _mm_free); otherwise they map to malloc / calloc / free.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// Substitute _mm_cvtss_f32 with the vector-extract builtin on GCC releases
// before 4.6 (never on clang).  The version test compares major and minor
// together so that newer major versions with a small minor number (5.1, 7.3,
// ...) keep using the intrinsic from the headers.
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
        #define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif

#define MM_MALLOC(size) _mm_malloc((size), ALIGN_SIZE)

// Aligned counterpart of calloc: allocates nmemb*size zeroed bytes.
// Returns NULL when the allocation fails or when nmemb*size does not fit in
// size_t (calloc rejects such requests too).
static void *MM_CALLOC(size_t nmemb, size_t size)
{
        size_t total;
        void *ptr;
        if (size != 0 && nmemb > (size_t)-1 / size)
                return NULL;
        total = nmemb * size;
        ptr = _mm_malloc(total, ALIGN_SIZE);
        if (ptr != NULL)
                memset(ptr, 0, total);
        return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
109
// Indices of the vertex attribute arrays (position, color and eight texcoord
// sets).  DPSOFTRAST_ARRAY_TOTAL is the count; it sizes
// DPSOFTRAST_State_Triangle.attribs and DPSOFTRAST_State.post_array4f.
110 typedef enum DPSOFTRAST_ARRAY_e
111 {
112         DPSOFTRAST_ARRAY_POSITION,
113         DPSOFTRAST_ARRAY_COLOR,
114         DPSOFTRAST_ARRAY_TEXCOORD0,
115         DPSOFTRAST_ARRAY_TEXCOORD1,
116         DPSOFTRAST_ARRAY_TEXCOORD2,
117         DPSOFTRAST_ARRAY_TEXCOORD3,
118         DPSOFTRAST_ARRAY_TEXCOORD4,
119         DPSOFTRAST_ARRAY_TEXCOORD5,
120         DPSOFTRAST_ARRAY_TEXCOORD6,
121         DPSOFTRAST_ARRAY_TEXCOORD7,
122         DPSOFTRAST_ARRAY_TOTAL
123 }
124 DPSOFTRAST_ARRAY;
125
// One slot of the dpsoftrast.texture array.  A slot whose bytes pointer is
// NULL is treated as unused (see DPSOFTRAST_Texture_GetByIndex and the slot
// search in DPSOFTRAST_Texture_New).
126 typedef struct DPSOFTRAST_Texture_s
127 {
128         int flags; // flags passed to DPSOFTRAST_Texture_New (DPSOFTRAST_TEXTURE_FLAG_* plus format bits)
129         int width;
130         int height;
131         int depth;
132         int sides; // 6 for cubemaps, 1 otherwise
133         DPSOFTRAST_TEXTURE_FILTER filter;
134         int mipmaps; // number of valid entries in mipmap[]
135         int size; // total byte size of all mip levels
136         ATOMIC_COUNTER binds; // when nonzero, DPSOFTRAST_Texture_Free flushes before freeing
137         unsigned char *bytes; // pixel storage from MM_CALLOC, 4 bytes per texel
138         int mipmap[DPSOFTRAST_MAXMIPMAPS][5]; // per level: byte offset into bytes, byte size, width, height, depth
139 }
140 DPSOFTRAST_Texture;
141
// Command records are aligned the same way as every other ALIGN()ed object.
142 #define COMMAND_SIZE ALIGN_SIZE
143 #define COMMAND_ALIGN(var) ALIGN(var)
144
// Common header shared by all command structs: DEFCOMMAND below starts every
// generated struct with the same two members.
145 typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
146 {
147         unsigned char opcode;
148         unsigned short commandsize;
149 }
150 DPSOFTRAST_Command);
151
152 enum { DPSOFTRAST_OPCODE_Reset = 0 };
153
// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> = opcodeval and the aligned struct type
// DPSOFTRAST_Command_<name>, made of the common header followed by `fields`.
154 #define DEFCOMMAND(opcodeval, name, fields) \
155         enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
156         typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
157         { \
158                 unsigned char opcode; \
159                 unsigned short commandsize; \
160                 fields \
161         } DPSOFTRAST_Command_##name );
162
// MAXCOMMANDPOOL is the byte size of DPSOFTRAST_State_Command_Pool.commands.
// NOTE(review): MAXCOMMANDSIZE is presumably the largest single command in
// bytes; its use is outside the visible part of the file.
163 #define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
164 #define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384
165
// Fixed-size byte buffer that holds queued commands, embedded in
// DPSOFTRAST_State as `commandpool`.
// NOTE(review): freecommand/usedcommands look like the write offset and the
// number of bytes in use - confirm against the allocation code.
166 typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
167 {
168         int freecommand;
169         int usedcommands;
170         ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
171 }
172 DPSOFTRAST_State_Command_Pool);
173
// Per-triangle interpolation data consumed by the DPSOFTRAST_CALCATTRIB
// macros: for each attribute array, attribs[array][0] is multiplied by the
// span x, attribs[array][1] by the span y and attribs[array][2] is the
// constant term, each a 4 component vector.
174 typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
175 {
176         unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
177         float w[3];
178         ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
179 }
180 DPSOFTRAST_State_Triangle);
181
// DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex)
// SSE form: evaluates attribute `arrayindex` of `triangle` at the start of
// `span`.  slope receives attribs[arrayindex][0] (the per-x delta) and
//   data = attribs[arrayindex][2] + span->x * attribs[arrayindex][0]
//                                 + span->y * attribs[arrayindex][1]
// data and slope must be __m128 lvalues; attribs must be 16 byte aligned
// because it is read with _mm_load_ps.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
// DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex)
// Scalar form of the same computation; data and slope are float[4] arrays.
// Every macro argument is parenthesized so that pointer expressions such as
// `spans + i` can be passed safely.  Arguments are evaluated several times,
// so they must not have side effects.
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        (slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
        (slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
        (slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
        (slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
        (data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
        (data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
        (data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
        (data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
198                                         
// NOTE(review): MAXSUBSPAN is not used in the visible part of the file;
// presumably the pixel count of one sub-span in the span shaders - confirm.
199 #define DPSOFTRAST_DRAW_MAXSUBSPAN 16
200
// One horizontal run of pixels produced from a triangle and queued in
// DPSOFTRAST_State_Thread.spans.
201 typedef ALIGN(struct DPSOFTRAST_State_Span_s
202 {
203         int triangle; // triangle this span was generated by
204         int x; // framebuffer x coord
205         int y; // framebuffer y coord
206         int startx; // usable range (according to pixelmask)
207         int endx; // usable range (according to pixelmask)
208         unsigned char *pixelmask; // true for pixels that passed depth test, false for others
209         int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
210         int depthslope; // depthbuffer value pixel delta
211 }
212 DPSOFTRAST_State_Span);
213
// Capacities of the per-thread arrays in DPSOFTRAST_State_Thread:
// spans[MAXSPANS], triangles[MAXTRIANGLES] and
// pixelmaskarray[MAXSPANLENGTH+4].
214 #define DPSOFTRAST_DRAW_MAXSPANS 1024
215 #define DPSOFTRAST_DRAW_MAXTRIANGLES 128
216 #define DPSOFTRAST_DRAW_MAXSPANLENGTH 256
217
// Bits of DPSOFTRAST_State_Thread.validate.  A set bit means the matching
// group of derived fb_* values is stale; DPSOFTRAST_Validate clears the bit
// and recomputes the group (RecalcFB / RecalcDepthFunc / RecalcBlendFunc).
218 #define DPSOFTRAST_VALIDATE_FB 1
219 #define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
220 #define DPSOFTRAST_VALIDATE_BLENDFUNC 4
221 #define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
222
// Blend modes stored in DPSOFTRAST_State_Thread.fb_blendmode.
// DPSOFTRAST_RecalcBlendFunc picks one from the GL blend factor pair and
// falls back to OPAQUE for any pair it does not recognize.
223 typedef enum DPSOFTRAST_BLENDMODE_e
224 {
225         DPSOFTRAST_BLENDMODE_OPAQUE,
226         DPSOFTRAST_BLENDMODE_ALPHA,
227         DPSOFTRAST_BLENDMODE_ADDALPHA,
228         DPSOFTRAST_BLENDMODE_ADD,
229         DPSOFTRAST_BLENDMODE_INVMOD,
230         DPSOFTRAST_BLENDMODE_MUL,
231         DPSOFTRAST_BLENDMODE_MUL2,
232         DPSOFTRAST_BLENDMODE_SUBALPHA,
233         DPSOFTRAST_BLENDMODE_PSEUDOALPHA,
234         DPSOFTRAST_BLENDMODE_INVADD,
235         DPSOFTRAST_BLENDMODE_TOTAL
236 }
237 DPSOFTRAST_BLENDMODE;
238
// State of one rasterizer thread (entries of dpsoftrast.threads).
// The first group of members is the raw render state; the fb_* members are
// derived from it by the DPSOFTRAST_Recalc* functions when the matching
// DPSOFTRAST_VALIDATE_ bit is set in `validate`.
239 typedef ALIGN(struct DPSOFTRAST_State_Thread_s
240 {
241         void *thread;
242         int index; // position of this thread among dpsoftrast.numthreads, used to pick its bands
243         
244         int cullface;
245         int colormask[4];
246         int blendfunc[2]; // source and destination blend factor
247         int blendsubtract;
248         int depthmask;
249         int depthtest;
250         int depthfunc;
251         int scissortest;
252         int viewport[4];
253         int scissor[4];
254         float depthrange[2];
255         float polygonoffset[2];
256         float clipplane[4];
257         ALIGN(float fb_clipplane[4]); // clipplane rescaled by DPSOFTRAST_RecalcClipPlane
258
259         int shader_mode;
260         int shader_permutation;
261         int shader_exactspecularmath;
262
263         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // point into dpsoftrast.texture, rebased by DPSOFTRAST_Texture_Grow
264         
265         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
266         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
267
268         // DPSOFTRAST_VALIDATE_ flags
269         int validate;
270
271         // derived values (DPSOFTRAST_VALIDATE_FB)
272         int fb_colormask;
273         int fb_scissor[4]; // x, y, width, height clamped to the framebuffer
274         ALIGN(float fb_viewportcenter[4]);
275         ALIGN(float fb_viewportscale[4]);
276
277         // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
278         int fb_depthfunc;
279
280         // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
281         int fb_blendmode;
282
283         // band boundaries
        // (set by DPSOFTRAST_RecalcThread: two separate row ranges when
        // dpsoftrast.interlace is set, otherwise both ranges are identical)
284         int miny1;
285         int maxy1;
286         int miny2;
287         int maxy2;
288
289         ATOMIC(volatile int commandoffset);
290
291         volatile bool waiting;
292         volatile bool starving;
293         void *waitcond;
294         void *drawcond;
295         void *drawmutex;
296
297         int numspans;
298         int numtriangles;
299         DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
300         DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
301         unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
302 }
303 DPSOFTRAST_State_Thread);
304
// Global rasterizer state; the single instance is `dpsoftrast` below.
305 typedef ALIGN(struct DPSOFTRAST_State_s
306 {
307         int fb_width;
308         int fb_height;
309         unsigned int *fb_depthpixels;
310         unsigned int *fb_colorpixels[4];
311
312         int viewport[4];
313         ALIGN(float fb_viewportcenter[4]);
314         ALIGN(float fb_viewportscale[4]);
315
316         float color[4];
317         ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
318         int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];
319
320         const float *pointer_vertex3f;
321         const float *pointer_color4f;
322         const unsigned char *pointer_color4ub;
323         const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
324         int stride_vertex;
325         int stride_color;
326         int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
327         int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
328         DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS]; // point into `texture`, rebased by DPSOFTRAST_Texture_Grow
329
330         int firstvertex;
331         int numvertices;
332         float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
333         float *screencoord4f;
334         int drawstarty;
335         int drawendy;
336         int drawclipped;
337         
338         int shader_mode;
339         int shader_permutation;
340         int shader_exactspecularmath;
341
342         int texture_max; // allocated number of slots in `texture`
343         int texture_end; // one past the highest slot in use
344         int texture_firstfree; // slot index where DPSOFTRAST_Texture_New starts searching
345         DPSOFTRAST_Texture *texture; // texture slot array, grown by DPSOFTRAST_Texture_Grow
346
347         int bigendian;
348
349         // error reporting
        // (points at a string literal describing the most recent failure,
        // e.g. set by DPSOFTRAST_Texture_New before it returns 0)
350         const char *errorstring;
351
352         bool usethreads;
353         int interlace; // nonzero: each thread renders two bands, see DPSOFTRAST_RecalcThread
354         int numthreads;
355         DPSOFTRAST_State_Thread *threads;
356
357         ATOMIC(volatile int drawcommand);
358
359         DPSOFTRAST_State_Command_Pool commandpool;
360 }
361 DPSOFTRAST_State);
362
363 DPSOFTRAST_State dpsoftrast;
364
// Depth buffer scale factor: 2^30, applied to (1 - depth) below.
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels (expected in 0..1) into one 32 bit BGRA8 pixel:
// alpha in bits 24-31, red in 16-23, green in 8-15, blue in 0-7, each rounded
// to the nearest of 256 steps.  Arguments are parenthesized so expressions
// can be passed, and the alpha channel is shifted as unsigned so that a set
// top bit does not overflow a signed int; the result type stays int.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | (int)((unsigned int)(int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth buffer encoding; larger d
// gives a smaller integer (d = 1 maps to 0, d = 0 maps to 2^30).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
369
// Forward declarations; both functions are defined outside the visible part
// of this file.
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
372
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
374 {
375         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377         fb_viewportcenter[3] = 0.5f;
378         fb_viewportcenter[0] = 0.0f;
379         fb_viewportscale[1] = 0.5f * viewport[2];
380         fb_viewportscale[2] = -0.5f * viewport[3];
381         fb_viewportscale[3] = 0.5f;
382         fb_viewportscale[0] = 1.0f;
383 }
384
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
386 {
387         if (dpsoftrast.interlace)
388         {
389                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
393         }
394         else
395         {
396                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
398         }
399 }
400
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
402 {
403         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
408 }
409
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
411 {
412         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413         // and viewport projection values
414         int x1, x2;
415         int y1, y2;
416         x1 = thread->scissor[0];
417         x2 = thread->scissor[0] + thread->scissor[2];
418         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419         y2 = dpsoftrast.fb_height - thread->scissor[1];
420         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
421         if (x1 < 0) x1 = 0;
422         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
423         if (y1 < 0) y1 = 0;
424         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
425         thread->fb_scissor[0] = x1;
426         thread->fb_scissor[1] = y1;
427         thread->fb_scissor[2] = x2 - x1;
428         thread->fb_scissor[3] = y2 - y1;
429
430         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431         DPSOFTRAST_RecalcClipPlane(thread);
432         DPSOFTRAST_RecalcThread(thread);
433 }
434
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
436 {
437         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
438 }
439
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
441 {
442         if (thread->blendsubtract)
443         {
444                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
445                 {
446                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
450                 }
451         }
452         else
453         {       
454                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
455                 {
456                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
461                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
467                 }
468         }
469 }
470
// Inline pre-check for DPSOFTRAST_Validate: the function is only called when
// at least one of the flags in f is still pending in thread->validate.
// Always evaluates to 0.  `thread` and `f` are parenthesized so pointer
// expressions can be passed; both are evaluated more than once, so they must
// not have side effects.
#define DPSOFTRAST_ValidateQuick(thread, f) (((thread)->validate & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
472
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
474 {
475         mask &= thread->validate;
476         if (!mask)
477                 return;
478         if (mask & DPSOFTRAST_VALIDATE_FB)
479         {
480                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481                 DPSOFTRAST_RecalcFB(thread);
482         }
483         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
484         {
485                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486                 DPSOFTRAST_RecalcDepthFunc(thread);
487         }
488         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
489         {
490                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491                 DPSOFTRAST_RecalcBlendFunc(thread);
492         }
493 }
494
495 DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
496 {
497         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498                 return &dpsoftrast.texture[index];
499         return NULL;
500 }
501
502 static void DPSOFTRAST_Texture_Grow(void)
503 {
504         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505         DPSOFTRAST_State_Thread *thread;
506         int i;
507         int j;
508         DPSOFTRAST_Flush();
509         // expand texture array as needed
510         if (dpsoftrast.texture_max < 1024)
511                 dpsoftrast.texture_max = 1024;
512         else
513                 dpsoftrast.texture_max *= 2;
514         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
515         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516                 if (dpsoftrast.texbound[i])
517                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
518         for (j = 0; j < dpsoftrast.numthreads; j++)
519         {
520                 thread = &dpsoftrast.threads[j];
521                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522                         if (thread->texbound[i])
523                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
524         }
525 }
526
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
528 {
529         int w;
530         int h;
531         int d;
532         int size;
533         int s;
534         int texnum;
535         int mipmaps;
536         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538         DPSOFTRAST_Texture *texture;
539         if (width*height*depth < 1)
540         {
541                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
542                 return 0;
543         }
544         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
545         {
546                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
547                 return 0;
548         }
549         switch(texformat)
550         {
551         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
554                 break;
555         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
557                 {
558                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
559                         return 0;
560                 }
561                 if (depth != 1)
562                 {
563                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
564                         return 0;
565                 }
566                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
567                 {
568                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
569                         return 0;
570                 }
571                 break;
572         }
573         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
574         {
575                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
576                 return 0;
577         }
578         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
579         {
580                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
581                 return 0;
582         }
583         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
584         {
585                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
586                 return 0;
587         }
588         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
589         {
590                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
591                 return 0;
592         }
593         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
594         {
595                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
596                 return 0;
597         }
598         // find first empty slot in texture array
599         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600                 if (!dpsoftrast.texture[texnum].bytes)
601                         break;
602         dpsoftrast.texture_firstfree = texnum + 1;
603         if (dpsoftrast.texture_max <= texnum)
604                 DPSOFTRAST_Texture_Grow();
605         if (dpsoftrast.texture_end <= texnum)
606                 dpsoftrast.texture_end = texnum + 1;
607         texture = &dpsoftrast.texture[texnum];
608         memset(texture, 0, sizeof(*texture));
609         texture->flags = flags;
610         texture->width = width;
611         texture->height = height;
612         texture->depth = depth;
613         texture->sides = sides;
614         texture->binds = 0;
615         w = width;
616         h = height;
617         d = depth;
618         size = 0;
619         mipmaps = 0;
620         w = width;
621         h = height;
622         d = depth;
623         for (;;)
624         {
625                 s = w * h * d * sides * 4;
626                 texture->mipmap[mipmaps][0] = size;
627                 texture->mipmap[mipmaps][1] = s;
628                 texture->mipmap[mipmaps][2] = w;
629                 texture->mipmap[mipmaps][3] = h;
630                 texture->mipmap[mipmaps][4] = d;
631                 size += s;
632                 mipmaps++;
633                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
634                         break;
635                 if (w > 1) w >>= 1;
636                 if (h > 1) h >>= 1;
637                 if (d > 1) d >>= 1;
638         }
639         texture->mipmaps = mipmaps;
640         texture->size = size;
641
642         // allocate the pixels now
643         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
644
645         return texnum;
646 }
647 void DPSOFTRAST_Texture_Free(int index)
648 {
649         DPSOFTRAST_Texture *texture;
650         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
651         if (texture->binds)
652                 DPSOFTRAST_Flush();
653         if (texture->bytes)
654                 MM_FREE(texture->bytes);
655         texture->bytes = NULL;
656         memset(texture, 0, sizeof(*texture));
657         // adjust the free range and used range
658         if (dpsoftrast.texture_firstfree > index)
659                 dpsoftrast.texture_firstfree = index;
660         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
661                 dpsoftrast.texture_end--;
662 }
663 void DPSOFTRAST_Texture_CalculateMipmaps(int index)
664 {
665         int i, x, y, z, w, layer0, layer1, row0, row1;
666         unsigned char *o, *i0, *i1, *i2, *i3;
667         DPSOFTRAST_Texture *texture;
668         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
669         if (texture->mipmaps <= 1)
670                 return;
671         for (i = 1;i < texture->mipmaps;i++)
672         {
673                 for (z = 0;z < texture->mipmap[i][4];z++)
674                 {
675                         layer0 = z*2;
676                         layer1 = z*2+1;
677                         if (layer1 >= texture->mipmap[i-1][4])
678                                 layer1 = texture->mipmap[i-1][4]-1;
679                         for (y = 0;y < texture->mipmap[i][3];y++)
680                         {
681                                 row0 = y*2;
682                                 row1 = y*2+1;
683                                 if (row1 >= texture->mipmap[i-1][3])
684                                         row1 = texture->mipmap[i-1][3]-1;
685                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
686                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
687                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
688                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
689                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
690                                 w = texture->mipmap[i][2];
691                                 if (layer1 > layer0)
692                                 {
693                                         if (texture->mipmap[i-1][2] > 1)
694                                         {
695                                                 // average 3D texture
696                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
697                                                 {
698                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
699                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
700                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
701                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
702                                                 }
703                                         }
704                                         else
705                                         {
706                                                 // average 3D mipmap with parent width == 1
707                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
708                                                 {
709                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
710                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
711                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
712                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
713                                                 }
714                                         }
715                                 }
716                                 else
717                                 {
718                                         if (texture->mipmap[i-1][2] > 1)
719                                         {
720                                                 // average 2D texture (common case)
721                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
722                                                 {
723                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
724                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
725                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
726                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
727                                                 }
728                                         }
729                                         else
730                                         {
731                                                 // 2D texture with parent width == 1
732                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
733                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
734                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
735                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
736                                         }
737                                 }
738                         }
739                 }
740         }
741 }
742 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
743 {
744         DPSOFTRAST_Texture *texture;
745         unsigned char *dst;
746         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
747         if (texture->binds)
748                 DPSOFTRAST_Flush();
749         if (pixels)
750         {
751                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
752                 while (blockheight > 0)
753                 {
754                         memcpy(dst, pixels, blockwidth * 4);
755                         pixels += blockwidth * 4;
756                         dst += texture->mipmap[0][2] * 4;
757                         blockheight--;
758                 }
759         }
760         DPSOFTRAST_Texture_CalculateMipmaps(index);
761 }
762 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
763 {
764         DPSOFTRAST_Texture *texture;
765         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
766         if (texture->binds)
767                 DPSOFTRAST_Flush();
768         if (pixels)
769                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
770         DPSOFTRAST_Texture_CalculateMipmaps(index);
771 }
772 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
773 {
774         DPSOFTRAST_Texture *texture;
775         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
776         return texture->mipmap[mip][2];
777 }
778 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
779 {
780         DPSOFTRAST_Texture *texture;
781         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
782         return texture->mipmap[mip][3];
783 }
784 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
785 {
786         DPSOFTRAST_Texture *texture;
787         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
788         return texture->mipmap[mip][4];
789 }
790 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
791 {
792         DPSOFTRAST_Texture *texture;
793         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
794         if (texture->binds)
795                 DPSOFTRAST_Flush();
796         return texture->bytes + texture->mipmap[mip][0];
797 }
798 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
799 {
800         DPSOFTRAST_Texture *texture;
801         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
802         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
803         {
804                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
805                 return;
806         }
807         if (texture->binds)
808                 DPSOFTRAST_Flush();
809         texture->filter = filter;
810 }
811
// forward declaration: DPSOFTRAST_AllocateCommand below calls this (in the
// non-threaded case) before the function's definition appears
static void DPSOFTRAST_Draw_FlushThreads(void);
813
// Copies the command pool's current write position (commandpool.freecommand)
// into dpsoftrast.drawcommand.  When threads are in use a MEMORY_BARRIER is
// issued first, so the barrier sits between earlier command writes and the
// update of drawcommand.
// NOTE(review): drawcommand is presumably the end marker the draw threads
// consume commands up to - confirm against the thread loop.
static void DPSOFTRAST_Draw_SyncCommands(void)
{
	if(dpsoftrast.usethreads) MEMORY_BARRIER;
	dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
}
819
// Waits until no more than DPSOFTRAST_DRAW_MAXCOMMANDPOOL - space units of the
// command pool are still pending for any thread, i.e. until `space` units are
// available.  Returns immediately if the cached commandpool.usedcommands
// already satisfies that.  On exit commandpool.usedcommands is refreshed with
// the largest backlog measured across the threads.
static void DPSOFTRAST_Draw_FreeCommandPool(int space)
{
	DPSOFTRAST_State_Thread *thread;
	int i;
	int freecommand = dpsoftrast.commandpool.freecommand;
	int usedcommands = dpsoftrast.commandpool.usedcommands;
	if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
		return;
	// update dpsoftrast.drawcommand to the current write position before waiting
	DPSOFTRAST_Draw_SyncCommands();
	for(;;)
	{
		int waitindex = -1;
		int commandoffset;
		usedcommands = 0;
		// find the thread that is furthest behind the write position; its
		// backlog (distance from its commandoffset to freecommand, with
		// wrap-around) is the amount of the pool still in use
		for (i = 0; i < dpsoftrast.numthreads; i++)
		{
			thread = &dpsoftrast.threads[i]; 
			commandoffset = freecommand - thread->commandoffset;
			if (commandoffset < 0)
				commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
			if (commandoffset > usedcommands)
			{
				waitindex = i;
				usedcommands = commandoffset;
			}
		}
		// done when enough space is free, or when no thread has any backlog
		if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
			break;
		thread = &dpsoftrast.threads[waitindex];
		Thread_LockMutex(thread->drawmutex);
		// re-check under the mutex: only wait if the thread has not caught up yet
		if (thread->commandoffset != dpsoftrast.drawcommand)
		{
			thread->waiting = true;
			// NOTE(review): `starving` presumably means the thread is idle waiting
			// on drawcond for more commands - confirm against the thread loop
			if (thread->starving) Thread_CondSignal(thread->drawcond);
			Thread_CondWait(thread->waitcond, thread->drawmutex);
			thread->waiting = false;
		}
		Thread_UnlockMutex(thread->drawmutex);
	}
	dpsoftrast.commandpool.usedcommands = usedcommands;
}
861
// rounds `size` up to the next multiple of COMMAND_SIZE
// (the mask arithmetic assumes COMMAND_SIZE is a power of two)
#define DPSOFTRAST_ALIGNCOMMAND(size) \
	((size) + ((COMMAND_SIZE - ((size)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// allocates a command of type DPSOFTRAST_Command_<name> with opcode
// DPSOFTRAST_OPCODE_<name> from the command pool, using the aligned struct
// size, and yields a pointer to it typed as DPSOFTRAST_Command_<name> *
#define DPSOFTRAST_ALLOCATECOMMAND(name) \
	((DPSOFTRAST_Command_##name *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##name , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name ))))
866
// Reserves `size` units in the command pool (used as a ring buffer), fills in
// the command header (opcode and commandsize) and returns a pointer to the
// command.  If the command would not fit before the end of the pool, a
// DPSOFTRAST_OPCODE_Reset marker is written at the current position and the
// command is placed at offset 0 instead.  When the pool is too full, space is
// reclaimed first (waiting on the threads, or flushing when not threaded).
static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
{
	DPSOFTRAST_Command *command;
	int freecommand = dpsoftrast.commandpool.freecommand;
	int usedcommands = dpsoftrast.commandpool.usedcommands;
	// always demand room for one extra command header on top of `size`
	// NOTE(review): presumably so a Reset marker always fits - confirm
	int extra = sizeof(DPSOFTRAST_Command);
	// if the command does not fit in the tail of the pool, the tail is wasted too
	if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
		extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
	if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
	{
		if (dpsoftrast.usethreads)
			DPSOFTRAST_Draw_FreeCommandPool(size + extra);
		else
			DPSOFTRAST_Draw_FlushThreads();
		// both calls may have changed the pool state, so reload it
		freecommand = dpsoftrast.commandpool.freecommand;
		usedcommands = dpsoftrast.commandpool.usedcommands;
	}
	if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
	{
		// wrap: mark the unused tail with a Reset command and count it as used
		command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
		command->opcode = DPSOFTRAST_OPCODE_Reset;
		usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
		freecommand = 0;
	}
	command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
	command->opcode = opcode;
	command->commandsize = size;
	freecommand += size;
	// a command that ends exactly at the end of the pool wraps the write position
	if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
		freecommand = 0;
	dpsoftrast.commandpool.freecommand = freecommand;
	dpsoftrast.commandpool.usedcommands = usedcommands + size;
	return command;
}
901
902 static void DPSOFTRAST_UndoCommand(int size)
903 {
904         int freecommand = dpsoftrast.commandpool.freecommand;
905         int usedcommands = dpsoftrast.commandpool.usedcommands;
906         freecommand -= size;
907         if (freecommand < 0)
908                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
909         usedcommands -= size;
910         dpsoftrast.commandpool.freecommand = freecommand;
911         dpsoftrast.commandpool.usedcommands = usedcommands;
912 }
913                 
914 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
915 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
916 {
917         thread->viewport[0] = command->x;
918         thread->viewport[1] = command->y;
919         thread->viewport[2] = command->width;
920         thread->viewport[3] = command->height;
921         thread->validate |= DPSOFTRAST_VALIDATE_FB;
922 }
923 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
924 {
925         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
926         command->x = x;
927         command->y = y;
928         command->width = width;
929         command->height = height;
930
931         dpsoftrast.viewport[0] = x;
932         dpsoftrast.viewport[1] = y;
933         dpsoftrast.viewport[2] = width;
934         dpsoftrast.viewport[3] = height;
935         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
936 }
937
// ClearColor command: payload declaration and per-thread handler.
DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
// Fills the scissor rectangle of every allocated color buffer with the
// commanded color, restricted to the rows of this thread's two row bands
// [miny1, maxy1) and [miny2, maxy2).
static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
{
	int i, x1, y1, x2, y2, w, h, x, y;
	int miny1, maxy1, miny2, maxy2;
	int bandy;
	unsigned int *p;
	unsigned int c;
	// make sure fb_scissor and the band limits are up to date
	DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
	miny1 = thread->miny1;
	maxy1 = thread->maxy1;
	miny2 = thread->miny2;
	maxy2 = thread->maxy2;
	x1 = thread->fb_scissor[0];
	y1 = thread->fb_scissor[1];
	x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
	y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
	// clip the row range to the span covered by the two bands
	if (y1 < miny1) y1 = miny1;
	if (y2 > maxy2) y2 = maxy2;
	w = x2 - x1;
	h = y2 - y1;
	if (w < 1 || h < 1)
		return;
	// FIXME: honor fb_colormask?
	// pack the float color into one 32-bit BGRA pixel value
	c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
	for (i = 0;i < 4;i++)
	{
		// skip color buffers that are not present
		if (!dpsoftrast.fb_colorpixels[i])
			continue;
		// outer loop visits the bands: first rows up to maxy1, then it skips
		// ahead to miny2 and continues up to maxy2; inner loop fills the rows
		for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
		for (;y < bandy;y++)
		{
			p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
			for (x = x1;x < x2;x++)
				p[x] = c;
		}
	}
}
976 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
977 {
978         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
979         command->r = r;
980         command->g = g;
981         command->b = b;
982         command->a = a;
983 }
984
// ClearDepth command: payload declaration and per-thread handler.
DEFCOMMAND(3, ClearDepth, float depth;)
// Fills the scissor rectangle of the depth buffer with the commanded depth,
// restricted to the rows of this thread's two row bands [miny1, maxy1) and
// [miny2, maxy2).
static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
{
	int x1, y1, x2, y2, w, h, x, y;
	int miny1, maxy1, miny2, maxy2;
	int bandy;
	unsigned int *p;
	unsigned int c;
	// make sure fb_scissor and the band limits are up to date
	DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
	miny1 = thread->miny1;
	maxy1 = thread->maxy1;
	miny2 = thread->miny2;
	maxy2 = thread->maxy2;
	x1 = thread->fb_scissor[0];
	y1 = thread->fb_scissor[1];
	x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
	y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
	// clip the row range to the span covered by the two bands
	if (y1 < miny1) y1 = miny1;
	if (y2 > maxy2) y2 = maxy2;
	w = x2 - x1;
	h = y2 - y1;
	if (w < 1 || h < 1)
		return;
	// convert the float depth to the 32-bit value stored in the depth buffer
	c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
	// outer loop visits the bands: first rows up to maxy1, then it skips
	// ahead to miny2 and continues up to maxy2; inner loop fills the rows
	for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
	for (;y < bandy;y++)
	{
		p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
		for (x = x1;x < x2;x++)
			p[x] = c;
	}
}
1017 void DPSOFTRAST_ClearDepth(float d)
1018 {
1019         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1020         command->depth = d;
1021 }
1022
1023 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1024 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1025 {
1026         thread->colormask[0] = command->r != 0;
1027         thread->colormask[1] = command->g != 0;
1028         thread->colormask[2] = command->b != 0;
1029         thread->colormask[3] = command->a != 0;
1030         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1031 }
1032 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1033 {
1034         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1035         command->r = r;
1036         command->g = g;
1037         command->b = b;
1038         command->a = a;
1039 }
1040
1041 DEFCOMMAND(5, DepthTest, int enable;)
1042 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1043 {
1044         thread->depthtest = command->enable;
1045         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1046 }
1047 void DPSOFTRAST_DepthTest(int enable)
1048 {
1049         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1050         command->enable = enable;
1051 }
1052
1053 DEFCOMMAND(6, ScissorTest, int enable;)
1054 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1055 {
1056         thread->scissortest = command->enable;
1057         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1058 }
1059 void DPSOFTRAST_ScissorTest(int enable)
1060 {
1061         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1062         command->enable = enable;
1063 }
1064
1065 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1066 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1067 {
1068         thread->scissor[0] = command->x;
1069         thread->scissor[1] = command->y;
1070         thread->scissor[2] = command->width;
1071         thread->scissor[3] = command->height;
1072         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1073 }
1074 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1075 {
1076         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1077         command->x = x;
1078         command->y = y;
1079         command->width = width;
1080         command->height = height;
1081 }
1082
1083 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1084 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1085 {
1086         thread->blendfunc[0] = command->sfactor;
1087         thread->blendfunc[1] = command->dfactor;
1088         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1089 }
1090 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1091 {
1092         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1093         command->sfactor = sfactor;
1094         command->dfactor = dfactor;
1095 }
1096
1097 DEFCOMMAND(9, BlendSubtract, int enable;)
1098 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1099 {
1100         thread->blendsubtract = command->enable;
1101         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1102 }
1103 void DPSOFTRAST_BlendSubtract(int enable)
1104 {
1105         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1106         command->enable = enable;
1107 }
1108
1109 DEFCOMMAND(10, DepthMask, int enable;)
1110 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1111 {
1112         thread->depthmask = command->enable;
1113 }
1114 void DPSOFTRAST_DepthMask(int enable)
1115 {
1116         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1117         command->enable = enable;
1118 }
1119
1120 DEFCOMMAND(11, DepthFunc, int func;)
1121 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1122 {
1123         thread->depthfunc = command->func;
1124 }
1125 void DPSOFTRAST_DepthFunc(int func)
1126 {
1127         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1128         command->func = func;
1129 }
1130
1131 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1132 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1133 {
1134         thread->depthrange[0] = command->nearval;
1135         thread->depthrange[1] = command->farval;
1136 }
1137 void DPSOFTRAST_DepthRange(float nearval, float farval)
1138 {
1139         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1140         command->nearval = nearval;
1141         command->farval = farval;
1142 }
1143
1144 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1145 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1146 {
1147         thread->polygonoffset[0] = command->alongnormal;
1148         thread->polygonoffset[1] = command->intoview;
1149 }
1150 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1151 {
1152         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1153         command->alongnormal = alongnormal;
1154         command->intoview = intoview;
1155 }
1156
1157 DEFCOMMAND(14, CullFace, int mode;)
1158 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1159 {
1160         thread->cullface = command->mode;
1161 }
1162 void DPSOFTRAST_CullFace(int mode)
1163 {
1164         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1165         command->mode = mode;
1166 }
1167
1168 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1169 {
1170         dpsoftrast.color[0] = r;
1171         dpsoftrast.color[1] = g;
1172         dpsoftrast.color[2] = b;
1173         dpsoftrast.color[3] = a;
1174 }
1175
1176 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1177 {
1178         int outstride = blockwidth * 4;
1179         int instride = dpsoftrast.fb_width * 4;
1180         int bx1 = blockx;
1181         int by1 = blocky;
1182         int bx2 = blockx + blockwidth;
1183         int by2 = blocky + blockheight;
1184         int bw;
1185         int x;
1186         int y;
1187         unsigned char *inpixels;
1188         unsigned char *b;
1189         unsigned char *o;
1190         DPSOFTRAST_Flush();
1191         if (bx1 < 0) bx1 = 0;
1192         if (by1 < 0) by1 = 0;
1193         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1194         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1195         bw = bx2 - bx1;
1196         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1197         if (dpsoftrast.bigendian)
1198         {
1199                 for (y = by1;y < by2;y++)
1200                 {
1201                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1202                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1203                         for (x = bx1;x < bx2;x++)
1204                         {
1205                                 o[0] = b[3];
1206                                 o[1] = b[2];
1207                                 o[2] = b[1];
1208                                 o[3] = b[0];
1209                                 o += 4;
1210                                 b += 4;
1211                         }
1212                 }
1213         }
1214         else
1215         {
1216                 for (y = by1;y < by2;y++)
1217                 {
1218                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1219                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1220                         memcpy(o, b, bw*4);
1221                 }
1222         }
1223
1224 }
1225 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1226 {
1227         int tx1 = tx;
1228         int ty1 = ty;
1229         int tx2 = tx + width;
1230         int ty2 = ty + height;
1231         int sx1 = sx;
1232         int sy1 = sy;
1233         int sx2 = sx + width;
1234         int sy2 = sy + height;
1235         int swidth;
1236         int sheight;
1237         int twidth;
1238         int theight;
1239         int sw;
1240         int sh;
1241         int tw;
1242         int th;
1243         int y;
1244         unsigned int *spixels;
1245         unsigned int *tpixels;
1246         DPSOFTRAST_Texture *texture;
1247         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1248         if (mip < 0 || mip >= texture->mipmaps) return;
1249         DPSOFTRAST_Flush();
1250         spixels = dpsoftrast.fb_colorpixels[0];
1251         swidth = dpsoftrast.fb_width;
1252         sheight = dpsoftrast.fb_height;
1253         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1254         twidth = texture->mipmap[mip][2];
1255         theight = texture->mipmap[mip][3];
1256         if (tx1 < 0) tx1 = 0;
1257         if (ty1 < 0) ty1 = 0;
1258         if (tx2 > twidth) tx2 = twidth;
1259         if (ty2 > theight) ty2 = theight;
1260         if (sx1 < 0) sx1 = 0;
1261         if (sy1 < 0) sy1 = 0;
1262         if (sx2 > swidth) sx2 = swidth;
1263         if (sy2 > sheight) sy2 = sheight;
1264         tw = tx2 - tx1;
1265         th = ty2 - ty1;
1266         sw = sx2 - sx1;
1267         sh = sy2 - sy1;
1268         if (tw > sw) tw = sw;
1269         if (th > sh) th = sh;
1270         if (tw < 1 || th < 1)
1271                 return;
1272         sy1 = sheight - 1 - sy1;
1273         for (y = 0;y < th;y++)
1274                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1275         if (texture->mipmaps > 1)
1276                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1277 }
1278
1279 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1280 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1281 {
1282         if (thread->texbound[command->unitnum])
1283                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1284         thread->texbound[command->unitnum] = command->texture;
1285 }
1286 void DPSOFTRAST_SetTexture(int unitnum, int index)
1287 {
1288         DPSOFTRAST_Command_SetTexture *command;
1289         DPSOFTRAST_Texture *texture;
1290         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1291         {
1292                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1293                 return;
1294         }
1295         texture = DPSOFTRAST_Texture_GetByIndex(index);
1296         if (index && !texture)
1297         {
1298                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1299                 return;
1300         }
1301
1302         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1303         command->unitnum = unitnum;
1304         command->texture = texture;
1305
1306         dpsoftrast.texbound[unitnum] = texture;
1307         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1308 }
1309
1310 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1311 {
1312         dpsoftrast.pointer_vertex3f = vertex3f;
1313         dpsoftrast.stride_vertex = stride;
1314 }
1315 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1316 {
1317         dpsoftrast.pointer_color4f = color4f;
1318         dpsoftrast.pointer_color4ub = NULL;
1319         dpsoftrast.stride_color = stride;
1320 }
1321 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1322 {
1323         dpsoftrast.pointer_color4f = NULL;
1324         dpsoftrast.pointer_color4ub = color4ub;
1325         dpsoftrast.stride_color = stride;
1326 }
1327 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1328 {
1329         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1330         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1331         dpsoftrast.stride_texcoord[unitnum] = stride;
1332 }
1333
1334 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1335 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1336 {
1337         thread->shader_mode = command->mode;
1338         thread->shader_permutation = command->permutation;
1339         thread->shader_exactspecularmath = command->exactspecularmath;
1340 }
1341 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1342 {
1343         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1344         command->mode = mode;
1345         command->permutation = permutation;
1346         command->exactspecularmath = exactspecularmath;
1347
1348         dpsoftrast.shader_mode = mode;
1349         dpsoftrast.shader_permutation = permutation;
1350         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1351 }
1352
1353 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1354 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1355 {
1356         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1357 }
1358 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1359 {
1360         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1361         command->index = index;
1362         command->val[0] = v0;
1363         command->val[1] = v1;
1364         command->val[2] = v2;
1365         command->val[3] = v3;
1366
1367         dpsoftrast.uniform4f[index*4+0] = v0;
1368         dpsoftrast.uniform4f[index*4+1] = v1;
1369         dpsoftrast.uniform4f[index*4+2] = v2;
1370         dpsoftrast.uniform4f[index*4+3] = v3;
1371 }
1372 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1373 {
1374         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1375         command->index = index;
1376         memcpy(command->val, v, sizeof(command->val));
1377
1378         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1379 }
1380
1381 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1382 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1383 {
1384         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1385 }
// Sets `arraysize` consecutive 4x4 matrix uniforms starting at `uniform`.
// `v` points to arraysize*16 floats; each matrix is optionally transposed,
// then stored both in a new UniformMatrix4f command and in
// dpsoftrast.uniform4f.  Consecutive matrices use uniform indices 4 apart.
// Without SSE_POSSIBLE this function does nothing.
void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
{
#ifdef SSE_POSSIBLE
	int i, index;
	for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
	{
		__m128 m0, m1, m2, m3;
		DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
		command->index = (DPSOFTRAST_UNIFORM)index;
		// the caller's data may be unaligned, so pick the matching load
		if (((size_t)v)&(ALIGN_SIZE-1))
		{
			m0 = _mm_loadu_ps(v);
			m1 = _mm_loadu_ps(v+4);
			m2 = _mm_loadu_ps(v+8);
			m3 = _mm_loadu_ps(v+12);
		}
		else
		{
			m0 = _mm_load_ps(v);
			m1 = _mm_load_ps(v+4);
			m2 = _mm_load_ps(v+8);
			m3 = _mm_load_ps(v+12);
		}
		if (transpose)
		{
			// 4x4 transpose: interleave pairs of rows, then recombine halves
			__m128 t0, t1, t2, t3;
			t0 = _mm_unpacklo_ps(m0, m1);
			t1 = _mm_unpacklo_ps(m2, m3);
			t2 = _mm_unpackhi_ps(m0, m1);
			t3 = _mm_unpackhi_ps(m2, m3);
			m0 = _mm_movelh_ps(t0, t1);
			m1 = _mm_movehl_ps(t1, t0);
			m2 = _mm_movelh_ps(t2, t3);
			m3 = _mm_movehl_ps(t3, t2);			
		}
		// aligned stores: command->val is declared with ALIGN()
		_mm_store_ps(command->val, m0);
		_mm_store_ps(command->val+4, m1);
		_mm_store_ps(command->val+8, m2);
		_mm_store_ps(command->val+12, m3);
		// mirror into the global state
		// NOTE(review): aligned stores assume dpsoftrast.uniform4f is 16-byte aligned - confirm its declaration
		_mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
		_mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
		_mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
		_mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
	}
#endif
}
1432
1433 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1434 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1435 {
1436         thread->uniform1i[command->index] = command->val;
1437 }
1438 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1439 {
1440         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1441         command->index = index;
1442         command->val = i0;
1443
1444         dpsoftrast.uniform1i[command->index] = i0;
1445 }
1446
1447 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1448 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1449 {
1450         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1451         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1452 }
1453 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1454 {
1455         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1456         command->clipplane[0] = x;
1457         command->clipplane[1] = y;
1458         command->clipplane[2] = z;
1459         command->clipplane[3] = w;
1460 }
1461
1462 #ifdef SSE_POSSIBLE
1463 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1464 {
1465         float *end = dst + size*4;
1466         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1467         {
1468                 while (dst < end)
1469                 {
1470                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1471                         dst += 4;
1472                         src += stride;
1473                 }
1474         }
1475         else
1476         {
1477                 while (dst < end)
1478                 {
1479                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1480                         dst += 4;
1481                         src += stride;
1482                 }
1483         }
1484 }
1485
1486 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1487 {
1488         float *end = dst + size*4;
1489         if (stride == sizeof(float[3]))
1490         {
1491                 float *end4 = dst + (size&~3)*4;        
1492                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1493                 {
1494                         while (dst < end4)
1495                         {
1496                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1497                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1498                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1501                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1502                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1503                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1504                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1505                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1506                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1507                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1508                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1509                                 dst += 16;
1510                                 src += 4*sizeof(float[3]);
1511                         }
1512                 }
1513                 else
1514                 {
1515                         while (dst < end4)
1516                         {
1517                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1518                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1519                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1520                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1521                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1522                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1523                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1524                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1525                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1526                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1527                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1528                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1529                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1530                                 dst += 16;
1531                                 src += 4*sizeof(float[3]);
1532                         }
1533                 }
1534         }
1535         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1536         {
1537                 while (dst < end)
1538                 {
1539                         __m128 v = _mm_loadu_ps((const float *)src);
1540                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1541                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1542                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1543                         _mm_store_ps(dst, v);
1544                         dst += 4;
1545                         src += stride;
1546                 }
1547         }
1548         else
1549         {
1550                 while (dst < end)
1551                 {
1552                         __m128 v = _mm_load_ps((const float *)src);
1553                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1554                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1555                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1556                         _mm_store_ps(dst, v);
1557                         dst += 4;
1558                         src += stride;
1559                 }
1560         }
1561 }
1562
1563 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1564 {
1565         float *end = dst + size*4;
1566         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1567         if (stride == sizeof(float[2]))
1568         {
1569                 float *end2 = dst + (size&~1)*4;
1570                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1571                 {
1572                         while (dst < end2)
1573                         {
1574                                 __m128 v = _mm_loadu_ps((const float *)src);
1575                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1576                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1577                                 dst += 8;
1578                                 src += 2*sizeof(float[2]);
1579                         }
1580                 }
1581                 else
1582                 {
1583                         while (dst < end2)
1584                         {
1585                                 __m128 v = _mm_load_ps((const float *)src);
1586                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1587                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1588                                 dst += 8;
1589                                 src += 2*sizeof(float[2]);
1590                         }
1591                 }
1592         }
1593         while (dst < end)
1594         {
1595                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1596                 dst += 4;
1597                 src += stride;
1598         }
1599 }
1600
1601 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1602 {
1603         float *end = dst + size*4;
1604         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1605         if (stride == sizeof(unsigned char[4]))
1606         {
1607                 float *end4 = dst + (size&~3)*4;
1608                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1609                 {
1610                         while (dst < end4)
1611                         {
1612                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1613                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1614                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1615                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1616                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1617                                 dst += 16;
1618                                 src += 4*sizeof(unsigned char[4]);
1619                         }
1620                 }
1621                 else
1622                 {
1623                         while (dst < end4)
1624                         {
1625                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1626                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1627                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1628                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1629                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1630                                 dst += 16;
1631                                 src += 4*sizeof(unsigned char[4]);
1632                         }
1633                 }
1634         }
1635         while (dst < end)
1636         {
1637                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1638                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1639                 dst += 4;
1640                 src += stride;
1641         }
1642 }
1643
// Writes the four floats at `src` to each of `size` consecutive four-float
// elements starting at `dst`.
//   dst  - destination, written with aligned 16-byte stores
//   src  - the four floats to replicate (read with an unaligned load)
//   size - number of elements to fill
static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
{
	const __m128 value = _mm_loadu_ps(src);
	int i;
	for (i = 0;i < size;i++, dst += 4)
		_mm_store_ps(dst, value);
}
1654 #endif
1655
// Multiplies `numitems` four-float vectors by a 16-float matrix.
//   out4f       - destination, written with aligned 16-byte stores
//   in4f        - source vectors (may be the same pointer as out4f)
//   numitems    - number of vectors
//   inmatrix16f - matrix as four groups of four floats; each output is
//                 x*group0 + y*group1 + z*group2 + w*group3
// If the matrix is bit-identical to the identity matrix the input is simply
// copied (or left alone when out4f == in4f).
// Without SSE_POSSIBLE this function does nothing.
void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 mx, my, mz, mw;
	int i;
	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}
	mx = _mm_loadu_ps(inmatrix16f);
	my = _mm_loadu_ps(inmatrix16f + 4);
	mz = _mm_loadu_ps(inmatrix16f + 8);
	mw = _mm_loadu_ps(inmatrix16f + 12);
	if (((size_t)in4f)&(ALIGN_SIZE-1))
	{
		// input is not 16-byte aligned
		for (i = 0;i < numitems;i++, out4f += 4, in4f += 4)
		{
			const __m128 vec = _mm_loadu_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(0, 0, 0, 0)), mx);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1, 1, 1, 1)), my);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(2, 2, 2, 2)), mz);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
	else
	{
		for (i = 0;i < numitems;i++, out4f += 4, in4f += 4)
		{
			const __m128 vec = _mm_load_ps(in4f);
			const __m128 xterm = _mm_mul_ps(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(0, 0, 0, 0)), mx);
			const __m128 yterm = _mm_mul_ps(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(1, 1, 1, 1)), my);
			const __m128 zterm = _mm_mul_ps(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(2, 2, 2, 2)), mz);
			const __m128 wterm = _mm_mul_ps(_mm_shuffle_ps(vec, vec, _MM_SHUFFLE(3, 3, 3, 3)), mw);
			_mm_store_ps(out4f, _mm_add_ps(xterm, _mm_add_ps(yterm, _mm_add_ps(zterm, wterm))));
		}
	}
#endif
}
1703
// Copies `numitems` four-float vectors from in4f to out4f.
//   out4f    - destination
//   in4f     - source; may alias or overlap the destination
//   numitems - number of vectors; zero or negative counts copy nothing
void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	// a negative count would otherwise convert to an enormous size_t
	if (numitems <= 0 || out4f == in4f)
		return;
	// memmove rather than memcpy: callers in this file operate on arrays
	// in place, and memcpy is undefined for overlapping ranges
	memmove(out4f, in4f, (size_t)numitems * sizeof(float[4]));
}
1708
1709 #ifdef SSE_POSSIBLE
// out = perspective-divided, viewport-mapped version of the (x, y, z, w)
// vector `in`. The vector is rotated to (1, x, y, z), multiplied by
// viewportscale, divided by w, offset by viewportcenter, and rotated back, so
// lane 0 of viewportscale/viewportcenter applies to the 1/w term (which ends
// up in the w lane of `out`) and lanes 1..3 apply to x, y, z.
// Expands to a brace-enclosed block; `in` is evaluated once.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 dpsr_clip = (in); \
	__m128 dpsr_wwww = _mm_shuffle_ps(dpsr_clip, dpsr_clip, _MM_SHUFFLE(3, 3, 3, 3)); \
	__m128 dpsr_1xyz = _mm_move_ss(_mm_shuffle_ps(dpsr_clip, dpsr_clip, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
	__m128 dpsr_scrn = _mm_add_ps((viewportcenter), _mm_div_ps(_mm_mul_ps((viewportscale), dpsr_1xyz), dpsr_wwww)); \
	(out) = _mm_shuffle_ps(dpsr_scrn, dpsr_scrn, _MM_SHUFFLE(0, 3, 2, 1)); \
}

// Performs exactly the same computation as DPSOFTRAST_PROJECTVERTEX.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
	DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale)

// out = x*m0 + y*m1 + z*m2 + w*m3 for the (x, y, z, w) vector `in`.
// The sum is associated as x*m0 + (y*m1 + (z*m2 + w*m3)).
// Expands to a brace-enclosed block; every argument is evaluated once.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 dpsr_vec = (in); \
	__m128 dpsr_tx = _mm_mul_ps(_mm_shuffle_ps(dpsr_vec, dpsr_vec, _MM_SHUFFLE(0, 0, 0, 0)), (m0)); \
	__m128 dpsr_ty = _mm_mul_ps(_mm_shuffle_ps(dpsr_vec, dpsr_vec, _MM_SHUFFLE(1, 1, 1, 1)), (m1)); \
	__m128 dpsr_tz = _mm_mul_ps(_mm_shuffle_ps(dpsr_vec, dpsr_vec, _MM_SHUFFLE(2, 2, 2, 2)), (m2)); \
	__m128 dpsr_tw = _mm_mul_ps(_mm_shuffle_ps(dpsr_vec, dpsr_vec, _MM_SHUFFLE(3, 3, 3, 3)), (m3)); \
	(out) = _mm_add_ps(dpsr_tx, _mm_add_ps(dpsr_ty, _mm_add_ps(dpsr_tz, dpsr_tw))); \
}
1734
1735 static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
1736 {
1737         int clipmask = 0xFF;
1738         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1739         __m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
1740         __m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
1741         __m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
1742         m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
1743         m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
1744         m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
1745         m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
1746         #define BBFRONT(k, pos) \
1747         { \
1748                 DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
1749                 clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1750                 if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
1751                 { \
1752                         __m128 proj; \
1753                         clipmask &= ~(1<<k); \
1754                         proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
1755                         minproj = _mm_min_ss(minproj, proj); \
1756                         maxproj = _mm_max_ss(maxproj, proj); \
1757                 } \
1758         }
1759         BBFRONT(0, minpos); 
1760         BBFRONT(1, _mm_move_ss(minpos, maxpos)); 
1761         BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1762         BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1763         BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1764         BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0))); 
1765         BBFRONT(6, _mm_move_ss(maxpos, minpos)); 
1766         BBFRONT(7, maxpos);
1767         #define BBCLIP(k) \
1768         { \
1769                 if (clipmask&(1<<k)) \
1770                 { \
1771                         if (!(clipmask&(1<<(k^1)))) \
1772                         { \
1773                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
1774                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
1775                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1776                                 minproj = _mm_min_ss(minproj, proj); \
1777                                 maxproj = _mm_max_ss(maxproj, proj); \
1778                         } \
1779                         if (!(clipmask&(1<<(k^2)))) \
1780                         { \
1781                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
1782                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
1783                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1784                                 minproj = _mm_min_ss(minproj, proj); \
1785                                 maxproj = _mm_max_ss(maxproj, proj); \
1786                         } \
1787                         if (!(clipmask&(1<<(k^4)))) \
1788                         { \
1789                                 __m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
1790                                 __m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
1791                                 proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
1792                                 minproj = _mm_min_ss(minproj, proj); \
1793                                 maxproj = _mm_max_ss(maxproj, proj); \
1794                         } \
1795                 } \
1796         }
1797         BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
1798         viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
1799         viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
1800         minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
1801         maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
1802         minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
1803         maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
1804         *starty = _mm_cvttss_si32(maxproj);
1805         *endy = _mm_cvttss_si32(minproj)+1;
1806         return clipmask;
1807 }
1808         
1809 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1810 {
1811         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1812         float *end = out4f + numitems*4;
1813         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1814         __m128 minpos, maxpos;
1815         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1816         {
1817                 minpos = maxpos = _mm_loadu_ps(in4f);
1818                 while (out4f < end)
1819                 {
1820                         __m128 v = _mm_loadu_ps(in4f);
1821                         minpos = _mm_min_ps(minpos, v);
1822                         maxpos = _mm_max_ps(maxpos, v);
1823                         _mm_store_ps(out4f, v);
1824                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1825                         _mm_store_ps(screen4f, v);
1826                         in4f += 4;
1827                         out4f += 4;
1828                         screen4f += 4;
1829                 }
1830         }
1831         else
1832         {
1833                 minpos = maxpos = _mm_load_ps(in4f);
1834                 while (out4f < end)
1835                 {
1836                         __m128 v = _mm_load_ps(in4f);
1837                         minpos = _mm_min_ps(minpos, v);
1838                         maxpos = _mm_max_ps(maxpos, v);
1839                         _mm_store_ps(out4f, v);
1840                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1841                         _mm_store_ps(screen4f, v);
1842                         in4f += 4;
1843                         out4f += 4;
1844                         screen4f += 4;
1845                 }
1846         }
1847         if (starty && endy) 
1848         {
1849                 ALIGN(float minposf[4]);
1850                 ALIGN(float maxposf[4]);
1851                 _mm_store_ps(minposf, minpos);
1852                 _mm_store_ps(maxposf, maxpos);
1853                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1854         }
1855         return 0;
1856 }
1857
1858 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1859 {
1860         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1861         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1862         float *end;
1863         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1864                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1865         end = out4f + numitems*4;
1866         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1867         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1868         m0 = _mm_loadu_ps(inmatrix16f);
1869         m1 = _mm_loadu_ps(inmatrix16f + 4);
1870         m2 = _mm_loadu_ps(inmatrix16f + 8);
1871         m3 = _mm_loadu_ps(inmatrix16f + 12);
1872         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1873         {
1874                 minpos = maxpos = _mm_loadu_ps(in4f);
1875                 while (out4f < end)
1876                 {
1877                         __m128 v = _mm_loadu_ps(in4f);
1878                         minpos = _mm_min_ps(minpos, v);
1879                         maxpos = _mm_max_ps(maxpos, v);
1880                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1881                         _mm_store_ps(out4f, v);
1882                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1883                         _mm_store_ps(screen4f, v);
1884                         in4f += 4;
1885                         out4f += 4;
1886                         screen4f += 4;
1887                 }
1888         }
1889         else
1890         {
1891                 minpos = maxpos = _mm_load_ps(in4f);
1892                 while (out4f < end)
1893                 {
1894                         __m128 v = _mm_load_ps(in4f);
1895                         minpos = _mm_min_ps(minpos, v);
1896                         maxpos = _mm_max_ps(maxpos, v);
1897                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1898                         _mm_store_ps(out4f, v);
1899                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1900                         _mm_store_ps(screen4f, v);
1901                         in4f += 4;
1902                         out4f += 4;
1903                         screen4f += 4;
1904                 }
1905         }
1906         if (starty && endy) 
1907         {
1908                 ALIGN(float minposf[4]);
1909                 ALIGN(float maxposf[4]);
1910                 _mm_store_ps(minposf, minpos);
1911                 _mm_store_ps(maxposf, maxpos);
1912                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1913         }
1914         return 0;
1915 }
1916 #endif
1917
// Expands one of the currently bound vertex arrays into the four-float working
// array dpsoftrast.post_array4f[outarray], covering dpsoftrast.numvertices
// vertices starting at dpsoftrast.firstvertex.
//   inarray - DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_COLOR, or a texcoord
//             array (anything else is treated as DPSOFTRAST_ARRAY_TEXCOORD0 + n)
// Color falls back from the float pointer to the byte pointer to the constant
// dpsoftrast.color. A texcoord array with no pointer bound, or with a component
// count other than 2, 3 or 4, leaves the output array untouched.
// Returns the output array, or NULL when built without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const int first = dpsoftrast.firstvertex;
	const int count = dpsoftrast.numvertices;
	const unsigned char *inb;
	int stride;
	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + first * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, count, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + first * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, count, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + first * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, count, stride);
		}
		else
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, count);
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + first * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, count, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, count, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, count, stride);
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1976
1977 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1978 {
1979         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1980         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1981         return data;
1982 }
1983
#if 0
// (disabled) Projects a working array to screen coordinates without a
// transform; stores the resulting clip mask in dpsoftrast.drawclipped and the
// Y range in dpsoftrast.drawstarty / dpsoftrast.drawendy.
// A negative inarray reuses the current contents of the working array.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	return data;
#else
	return NULL;
#endif
}
#endif
1996
// Transforms a working array in place by inmatrix16f and writes the projected
// screen coordinates to dpsoftrast.screencoord4f; stores the resulting clip
// mask in dpsoftrast.drawclipped and the Y range in dpsoftrast.drawstarty /
// dpsoftrast.drawendy.
//   outarray    - index of the working array in dpsoftrast.post_array4f
//   inarray     - source array to load into it first, or a negative value to
//                 reuse the current contents of the working array
//   inmatrix16f - matrix applied to each vertex
// Returns the transformed array, or NULL when built without SSE_POSSIBLE.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];
	dpsoftrast.drawclipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	return data;
#else
	return NULL;
#endif
}
2007
2008 void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2009 {
2010         int x;
2011         int startx = span->startx;
2012         int endx = span->endx;
2013         float wslope = triangle->w[0];
2014         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2015         float endz = 1.0f / (w + wslope * startx);
2016         if (triangle->w[0] == 0)
2017         {
2018                 // LordHavoc: fast flat polygons (HUD/menu)
2019                 for (x = startx;x < endx;x++)
2020                         zf[x] = endz;
2021                 return;
2022         }
2023         for (x = startx;x < endx;)
2024         {
2025                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2026                 float z = endz, dz;
2027                 if (nextsub >= endx) nextsub = endsub = endx-1;
2028                 endz = 1.0f / (w + wslope * nextsub);
2029                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2030                 for (; x <= endsub; x++, z += dz)
2031                         zf[x] = z;
2032         }
2033 }
2034
// Writes the finished BGRA8 colors of one span (in4ub, 4 bytes per pixel,
// indexed by x) into dpsoftrast.fb_colorpixels[0] according to
// thread->fb_blendmode, touching only runs of pixels whose span->pixelmask
// entry is non-zero.
// Side effects on span->pixelmask: entries may be cleared (alpha kill, and
// zero-alpha pixels in the ALPHA/ADDALPHA/SUBALPHA blend modes), and two
// sentinel bytes are always written at pixelmask[endx] and pixelmask[endx+1].
// The whole body is compiled only when SSE_POSSIBLE is defined; without it the
// function does nothing.  The triangle parameter is not used here.
2035 void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2036 {
2037 #ifdef SSE_POSSIBLE
2038         int x;
2039         int startx = span->startx;
2040         int endx = span->endx;
2041         int maskx;
2042         int subx;
2043         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2044         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // pixel is only used for the NULL check below; all framebuffer
             // accesses go through pixeli
2045         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2046         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2047         if (!pixel)
2048                 return;
             // rebase both pointers so that index x is relative to column
             // span->x of row span->y
2049         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2050         pixeli += span->y * dpsoftrast.fb_width + span->x;
2051         // handle alphatest now (this affects depth writes too)
2052         if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2053                 for (x = startx;x < endx;x++)
2054                         if (in4ub[x*4+3] < 128)
2055                                 pixelmask[x] = false;
2056         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2057         // helps sprites, text and hud artwork
2058         switch(thread->fb_blendmode)
2059         {
2060         case DPSOFTRAST_BLENDMODE_ALPHA:
2061         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2062         case DPSOFTRAST_BLENDMODE_SUBALPHA:
                     // trim the span: startx moves to the first pixel with
                     // non-zero alpha, endx (via maskx) to the end of the last
                     // run of non-zero alpha; if no pixel has alpha, endx
                     // becomes startx and nothing is drawn
2063                 maskx = startx;
2064                 for (x = startx;x < endx;x++)
2065                 {
2066                         if (in4ub[x*4+3] >= 1)
2067                         {
2068                                 startx = x;
2069                                 for (;;)
2070                                 {
                                             // skip over the run of pixels that have alpha
2071                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2072                                         maskx = x;
2073                                         if (x >= endx) break;
                                             // NOTE(review): x is advanced twice here (this ++x and
                                             // the pre-increment in the while below), so the pixel
                                             // at maskx and the one after it are neither tested nor
                                             // masked by this loop - confirm this is intended
2074                                         ++x;
2075                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2076                                         if (x >= endx) break;
2077                                 }
2078                                 break;
2079                         }
2080                 }
2081                 endx = maskx;
2082                 break;
2083         case DPSOFTRAST_BLENDMODE_OPAQUE:
2084         case DPSOFTRAST_BLENDMODE_ADD:
2085         case DPSOFTRAST_BLENDMODE_INVMOD:
2086         case DPSOFTRAST_BLENDMODE_MUL:
2087         case DPSOFTRAST_BLENDMODE_MUL2:
2088         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2089         case DPSOFTRAST_BLENDMODE_INVADD:
2090                 break;
2091         }
2092         // put some special values at the end of the mask to ensure the loops end
             // (the 1 stops the "skip masked" scans, the 0 stops the "find subspan
             // length" scans)
             // NOTE(review): these stores land at endx and endx+1; when endx was not
             // trimmed above that is past span->endx, so the mask buffer presumably
             // has at least two bytes of slack - confirm.  When endx was trimmed they
             // overwrite real mask entries.
2093         pixelmask[endx] = 1;
2094         pixelmask[endx+1] = 0;
2095         // LordHavoc: use a double loop to identify subspans, this helps the
2096         // optimized copy/blend loops to perform at their best, most triangles
2097         // have only one run of pixels, and do the search using wide reads...
2098         x = startx;
2099         while (x < endx)
2100         {
2101                 // if this pixel is masked off, it's probably not alone...
2102                 if (!pixelmask[x])
2103                 {
2104                         x++;
2105 #if 1
2106                         if (x + 8 < endx)
2107                         {
2108                                 // the 4-item search must be aligned or else it stalls badly
2109                                 if ((x & 3) && !pixelmask[x]) 
2110                                 {
                                             // NOTE(review): this first test can never be true, the
                                             // enclosing condition already required !pixelmask[x]
2111                                         if(pixelmask[x]) goto endmasked;
2112                                         x++;
2113                                         if (x & 3)
2114                                         {
2115                                                 if(pixelmask[x]) goto endmasked;
2116                                                 x++;
2117                                                 if (x & 3)
2118                                                 {
2119                                                         if(pixelmask[x]) goto endmasked;
2120                                                         x++;
2121                                                 }
2122                                         }
2123                                 }
                                     // skip four masked-off pixels per iteration by reading four
                                     // mask bytes as one unsigned int
2124                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2125                                         x += 4;
2126                         }
2127 #endif
2128                         for (;!pixelmask[x];x++)
2129                                 ;
2130                         // rather than continue the loop, just check the end variable
2131                         if (x >= endx)
2132                                 break;
2133                 }
2134         endmasked:
                     // here x is the first pixel of a run to draw; subx will become the
                     // index one past the end of that run
2135                 // find length of subspan
2136                 subx = x + 1;
2137 #if 1
2138                 if (subx + 8 < endx)
2139                 {
2140                         if (subx & 3)
2141                         {
2142                                 if(!pixelmask[subx]) goto endunmasked;
2143                                 subx++;
2144                                 if (subx & 3)
2145                                 {
2146                                         if(!pixelmask[subx]) goto endunmasked;
2147                                         subx++;
2148                                         if (subx & 3)
2149                                         {
2150                                                 if(!pixelmask[subx]) goto endunmasked;
2151                                                 subx++;
2152                                         }
2153                                 }
2154                         }
                             // wide scan: only matches when all four mask bytes are exactly 1
2155                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2156                                 subx += 4;
2157                 }
2158 #endif
2159                 for (;pixelmask[subx];subx++)
2160                         ;
2161                 // the checks can overshoot, so make sure to clip it...
2162                 if (subx > endx)
2163                         subx = endx;
2164         endunmasked:
2165                 // now that we know the subspan length...  process!
                     // every case below advances x up to subx
2166                 switch(thread->fb_blendmode)
2167                 {
2168                 case DPSOFTRAST_BLENDMODE_OPAQUE:
                             // plain copy: 16 pixels per iteration, then 4, then 2, then 1
2169 #if 0
2170                         if (subx - x >= 16)
2171                         {
2172                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2173                                 x = subx;
2174                         }
2175                         else
2176 #elif 1
2177                         while (x + 16 <= subx)
2178                         {
2179                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2180                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2181                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2182                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2183                                 x += 16;
2184                         }
2185 #endif
2186                         {
2187                                 while (x + 4 <= subx)
2188                                 {
2189                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2190                                         x += 4;
2191                                 }
2192                                 if (x + 2 <= subx)
2193                                 {
2194                                         pixeli[x] = ini[x];
2195                                         pixeli[x+1] = ini[x+1];
2196                                         x += 2;
2197                                 }
2198                                 if (x < subx)
2199                                 {
2200                                         pixeli[x] = ini[x];
2201                                         x++;
2202                                 }
2203                         }
2204                         break;
2205                 case DPSOFTRAST_BLENDMODE_ALPHA:
                     // FINISHBLEND(blend2, blend1): widens the source (ini) and destination
                     // (pixeli) bytes to 16-bit lanes as src and dst, runs blend2 on two
                     // pixels at a time and blend1 on a trailing single pixel, then packs
                     // dst back to bytes with unsigned saturation and stores it in pixeli
2206                 #define FINISHBLEND(blend2, blend1) \
2207                         for (;x + 1 < subx;x += 2) \
2208                         { \
2209                                 __m128i src, dst; \
2210                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2211                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2212                                 blend2; \
2213                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2214                         } \
2215                         if (x < subx) \
2216                         { \
2217                                 __m128i src, dst; \
2218                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2219                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2220                                 blend1; \
2221                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2222                                 x++; \
2223                         }
                             // dst += ((src - dst) * srcalpha) >> 8
2224                         FINISHBLEND({
2225                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2226                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2227                         }, {
2228                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2229                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2230                         });
2231                         break;
2232                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
                             // dst += (src * srcalpha) >> 8
2233                         FINISHBLEND({
2234                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2235                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2236                         }, {
2237                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2238                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2239                         });
2240                         break;
2241                 case DPSOFTRAST_BLENDMODE_ADD:
                             // dst += src
2242                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2243                         break;
2244                 case DPSOFTRAST_BLENDMODE_INVMOD:
                             // dst -= (dst * src) >> 8
2245                         FINISHBLEND({
2246                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2247                         }, {
2248                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2249                         });
2250                         break;
2251                 case DPSOFTRAST_BLENDMODE_MUL:
                             // dst = (src * dst) >> 8
2252                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2253                         break;
2254                 case DPSOFTRAST_BLENDMODE_MUL2:
                             // dst = (src * dst) >> 7
2255                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2256                         break;
2257                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
                             // dst -= (src * srcalpha) >> 8
2258                         FINISHBLEND({
2259                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2260                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2261                         }, {
2262                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2263                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2264                         });
2265                         break;
2266                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
                             // dst = src + dst - ((dst * srcalpha) >> 8)
2267                         FINISHBLEND({
2268                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2269                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2270                         }, {
2271                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2272                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2273                         });
2274                         break;
2275                 case DPSOFTRAST_BLENDMODE_INVADD:
                             // dst += ((255 - dst) * src) >> 8
2276                         FINISHBLEND({
2277                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2278                         }, {
2279                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2280                         });
2281                         break;
2282                 }
2283         }
2284 #endif
2285 }
2286
2287 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2288         // warning: this is SLOW, only use if the optimized per-span functions won't do
2289 {
2290         const unsigned char * RESTRICT pixelbase;
2291         const unsigned char * RESTRICT pixel[4];
2292         int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2293         int wrapmask[2] = { width-1, height-1 };
2294         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2295         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2296         {
2297                 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2298                 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2299                 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2300                 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2301                 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2302                 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2303                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2304                 {
2305                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2306                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2307                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2308                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2309                 }
2310                 else
2311                 {
2312                         tci[0] &= wrapmask[0];
2313                         tci[1] &= wrapmask[1];
2314                         tci1[0] &= wrapmask[0];
2315                         tci1[1] &= wrapmask[1];
2316                 }
2317                 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2318                 pixel[1] = pixelbase + 4 * (tci[1]*width+tci1[0]);
2319                 pixel[2] = pixelbase + 4 * (tci1[1]*width+tci[0]);
2320                 pixel[3] = pixelbase + 4 * (tci1[1]*width+tci1[0]);
2321                 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2322                 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2323                 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2324                 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2325         }
2326         else
2327         {
2328                 int tci[2] = { x * width, y * height };
2329                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2330                 {
2331                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2332                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2333                 }
2334                 else
2335                 {
2336                         tci[0] &= wrapmask[0];
2337                         tci[1] &= wrapmask[1];
2338                 }
2339                 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2340                 c[0] = pixel[0][0];
2341                 c[1] = pixel[0][1];
2342                 c[2] = pixel[0][2];
2343                 c[3] = pixel[0][3];
2344         }
2345 }
2346
2347 void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2348 {
2349         int x;
2350         int startx = span->startx;
2351         int endx = span->endx;
2352         int flags;
2353         float c[4];
2354         float data[4];
2355         float slope[4];
2356         float tc[2], endtc[2];
2357         float tcscale[2];
2358         unsigned int tci[2];
2359         unsigned int tci1[2];
2360         unsigned int tcimin[2];
2361         unsigned int tcimax[2];
2362         int tciwrapmask[2];
2363         int tciwidth;
2364         int filter;
2365         int mip;
2366         const unsigned char * RESTRICT pixelbase;
2367         const unsigned char * RESTRICT pixel[4];
2368         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2369         // if no texture is bound, just fill it with white
2370         if (!texture)
2371         {
2372                 for (x = startx;x < endx;x++)
2373                 {
2374                         out4f[x*4+0] = 1.0f;
2375                         out4f[x*4+1] = 1.0f;
2376                         out4f[x*4+2] = 1.0f;
2377                         out4f[x*4+3] = 1.0f;
2378                 }
2379                 return;
2380         }
2381         mip = triangle->mip[texunitindex];
2382         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2383         // if this mipmap of the texture is 1 pixel, just fill it with that color
2384         if (texture->mipmap[mip][1] == 4)
2385         {
2386                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2387                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2388                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2389                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2390                 for (x = startx;x < endx;x++)
2391                 {
2392                         out4f[x*4+0] = c[0];
2393                         out4f[x*4+1] = c[1];
2394                         out4f[x*4+2] = c[2];
2395                         out4f[x*4+3] = c[3];
2396                 }
2397                 return;
2398         }
2399         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2400         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2401         flags = texture->flags;
2402         tcscale[0] = texture->mipmap[mip][2];
2403         tcscale[1] = texture->mipmap[mip][3];
2404         tciwidth = texture->mipmap[mip][2];
2405         tcimin[0] = 0;
2406         tcimin[1] = 0;
2407         tcimax[0] = texture->mipmap[mip][2]-1;
2408         tcimax[1] = texture->mipmap[mip][3]-1;
2409         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2410         tciwrapmask[1] = texture->mipmap[mip][3]-1;
2411         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2412         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2413         if (filter)
2414         {
2415                 endtc[0] -= 0.5f;
2416                 endtc[1] -= 0.5f;
2417         }
2418         for (x = startx;x < endx;)
2419         {
2420                 unsigned int subtc[2];
2421                 unsigned int substep[2];
2422                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2423                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2424                 if (nextsub >= endx)
2425                 {
2426                         nextsub = endsub = endx-1;      
2427                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2428                 }
2429                 tc[0] = endtc[0];
2430                 tc[1] = endtc[1];
2431                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2432                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2433                 if (filter)
2434                 {
2435                         endtc[0] -= 0.5f;
2436                         endtc[1] -= 0.5f;
2437                 }
2438                 substep[0] = (endtc[0] - tc[0]) * subscale;
2439                 substep[1] = (endtc[1] - tc[1]) * subscale;
2440                 subtc[0] = tc[0] * (1<<12);
2441                 subtc[1] = tc[1] * (1<<12);
2442                 if (filter)
2443                 {
2444                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2445                         {
2446                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2447                                 {
2448                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2449                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2450                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2451                                         tci[0] = subtc[0]>>12;
2452                                         tci[1] = subtc[1]>>12;
2453                                         tci1[0] = tci[0] + 1;
2454                                         tci1[1] = tci[1] + 1;
2455                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2456                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2457                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2458                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2459                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2460                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2461                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2462                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2463                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2464                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2465                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2466                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2467                                         out4f[x*4+0] = c[0];
2468                                         out4f[x*4+1] = c[1];
2469                                         out4f[x*4+2] = c[2];
2470                                         out4f[x*4+3] = c[3];
2471                                 }
2472                         }
2473                         else
2474                         {
2475                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2476                                 {
2477                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2478                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2479                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2480                                         tci[0] = subtc[0]>>12;
2481                                         tci[1] = subtc[1]>>12;
2482                                         tci1[0] = tci[0] + 1;
2483                                         tci1[1] = tci[1] + 1;
2484                                         tci[0] &= tciwrapmask[0];
2485                                         tci[1] &= tciwrapmask[1];
2486                                         tci1[0] &= tciwrapmask[0];
2487                                         tci1[1] &= tciwrapmask[1];
2488                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2489                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2490                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2491                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2492                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2493                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2494                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2495                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2496                                         out4f[x*4+0] = c[0];
2497                                         out4f[x*4+1] = c[1];
2498                                         out4f[x*4+2] = c[2];
2499                                         out4f[x*4+3] = c[3];
2500                                 }
2501                         }
2502                 }
2503                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2504                 {
2505                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2506                         {
2507                                 tci[0] = subtc[0]>>12;
2508                                 tci[1] = subtc[1]>>12;
2509                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2510                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2511                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2512                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2513                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2514                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2515                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2516                                 out4f[x*4+0] = c[0];
2517                                 out4f[x*4+1] = c[1];
2518                                 out4f[x*4+2] = c[2];
2519                                 out4f[x*4+3] = c[3];
2520                         }
2521                 }
2522                 else
2523                 {
2524                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2525                         {
2526                                 tci[0] = subtc[0]>>12;
2527                                 tci[1] = subtc[1]>>12;
2528                                 tci[0] &= tciwrapmask[0];
2529                                 tci[1] &= tciwrapmask[1];
2530                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2531                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2532                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2533                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2534                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2535                                 out4f[x*4+0] = c[0];
2536                                 out4f[x*4+1] = c[1];
2537                                 out4f[x*4+2] = c[2];
2538                                 out4f[x*4+3] = c[3];
2539                         }
2540                 }
2541         }
2542 }
2543
2544 void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2545 {
2546 #ifdef SSE_POSSIBLE
2547         int x;
2548         int startx = span->startx;
2549         int endx = span->endx;
2550         int flags;
2551         __m128 data, slope, tcscale;
2552         __m128i tcsize, tcmask, tcoffset, tcmax;
2553         __m128 tc, endtc;
2554         __m128i subtc, substep, endsubtc;
2555         int filter;
2556         int mip;
2557         int affine; // LordHavoc: optimized affine texturing case
2558         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2559         const unsigned char * RESTRICT pixelbase;
2560         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2561         // if no texture is bound, just fill it with white
2562         if (!texture)
2563         {
2564                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2565                 return;
2566         }
2567         mip = triangle->mip[texunitindex];
2568         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2569         // if this mipmap of the texture is 1 pixel, just fill it with that color
2570         if (texture->mipmap[mip][1] == 4)
2571         {
2572                 unsigned int k = *((const unsigned int *)pixelbase);
2573                 for (x = startx;x < endx;x++)
2574                         outi[x] = k;
2575                 return;
2576         }
2577         affine = zf[startx] == zf[endx-1];
2578         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2579         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2580         flags = texture->flags;
2581         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2582         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2583         tcscale = _mm_cvtepi32_ps(tcsize);
2584         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2585         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2586         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2587         if (filter)
2588                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2589         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2590         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2591         tcmax = _mm_packs_epi32(tcmask, tcmask);
2592         for (x = startx;x < endx;)
2593         {
2594                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2595                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2596                 if (nextsub >= endx || affine)
2597                 {
2598                         nextsub = endsub = endx-1;
2599                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2600                 }       
2601                 tc = endtc;
2602                 subtc = endsubtc;
2603                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2604                 if (filter)
2605                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2606                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2607                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2608                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2609                 substep = _mm_slli_epi32(substep, 1);
2610                 if (filter)
2611                 {
2612                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2613                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2614                         {
2615                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2616                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2617                                 {
2618                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2619                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2620                                         tci = _mm_madd_epi16(tci, tcoffset);
2621                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2622                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2623                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2624                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2625                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2626                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2627                                         fracm = _mm_srli_epi16(subtc, 1);
2628                                         pix1 = _mm_add_epi16(pix1,
2629                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2630                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2631                                         pix3 = _mm_add_epi16(pix3,
2632                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2633                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2634                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2635                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2636                                         pix2 = _mm_add_epi16(pix2,
2637                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2638                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2639                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2640                                 }
2641                                 if (x <= endsub)
2642                                 {
2643                                         const unsigned char * RESTRICT ptr1;
2644                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2645                                         tci = _mm_madd_epi16(tci, tcoffset);
2646                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2647                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2648                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2649                                         fracm = _mm_srli_epi16(subtc, 1);
2650                                         pix1 = _mm_add_epi16(pix1,
2651                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2652                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2653                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2654                                         pix1 = _mm_add_epi16(pix1,
2655                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2656                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2657                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2658                                         x++;
2659                                 }
2660                         }
2661                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2662                         {
2663                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2664                                 {
2665                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2666                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2667                                         tci = _mm_madd_epi16(tci, tcoffset);
2668                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2669                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2670                                                                                         _mm_setzero_si128());
2671                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2672                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2673                                                                                         _mm_setzero_si128());
2674                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2675                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2676                                         tci = _mm_madd_epi16(tci, tcoffset);
2677                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2678                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2679                                                                                         _mm_setzero_si128());
2680                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2681                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2682                                                                                         _mm_setzero_si128());
2683                                         fracm = _mm_srli_epi16(subtc, 1);
2684                                         pix1 = _mm_add_epi16(pix1,
2685                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2686                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2687                                         pix3 = _mm_add_epi16(pix3,
2688                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2689                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2690                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2691                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2692                                         pix2 = _mm_add_epi16(pix2,
2693                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2694                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2695                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2696                                 }
2697                                 if (x <= endsub)
2698                                 {
2699                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2700                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2701                                         tci = _mm_madd_epi16(tci, tcoffset);
2702                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2703                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2704                                                                                         _mm_setzero_si128());
2705                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2706                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2707                                                                                         _mm_setzero_si128());
2708                                         fracm = _mm_srli_epi16(subtc, 1);
2709                                         pix1 = _mm_add_epi16(pix1,
2710                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2711                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2712                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2713                                         pix1 = _mm_add_epi16(pix1,
2714                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2715                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2716                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2717                                         x++;
2718                                 }
2719                         }
2720                         else
2721                         {
2722                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2723                                 {
2724                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2725                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2726                                         tci = _mm_madd_epi16(tci, tcoffset);
2727                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2728                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2729                                                                                         _mm_setzero_si128());
2730                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2731                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2732                                                                                         _mm_setzero_si128());
2733                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2734                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2735                                         tci = _mm_madd_epi16(tci, tcoffset);
2736                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2737                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2738                                                                                         _mm_setzero_si128());
2739                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2740                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2741                                                                                         _mm_setzero_si128());
2742                                         fracm = _mm_srli_epi16(subtc, 1);
2743                                         pix1 = _mm_add_epi16(pix1,
2744                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2745                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2746                                         pix3 = _mm_add_epi16(pix3,
2747                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2748                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2749                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2750                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2751                                         pix2 = _mm_add_epi16(pix2,
2752                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2753                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2754                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2755                                 }
2756                                 if (x <= endsub)
2757                                 {
2758                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2759                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2760                                         tci = _mm_madd_epi16(tci, tcoffset);
2761                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2762                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2763                                                                                         _mm_setzero_si128());
2764                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2765                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2766                                                                                         _mm_setzero_si128());
2767                                         fracm = _mm_srli_epi16(subtc, 1);
2768                                         pix1 = _mm_add_epi16(pix1,
2769                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2770                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2771                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2772                                         pix1 = _mm_add_epi16(pix1,
2773                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2774                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2775                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2776                                         x++;
2777                                 }
2778                         }
2779                 }
2780                 else
2781                 {
2782                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2783                         {
2784                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2785                                 {
2786                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2787                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2788                                         tci = _mm_madd_epi16(tci, tcoffset);
2789                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2790                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2791                                 }
2792                                 if (x <= endsub)
2793                                 {
2794                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2795                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2796                                         tci = _mm_madd_epi16(tci, tcoffset);
2797                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2798                                         x++;
2799                                 }
2800                         }
2801                         else
2802                         {
2803                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2804                                 {
2805                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2806                                         tci = _mm_and_si128(tci, tcmax); 
2807                                         tci = _mm_madd_epi16(tci, tcoffset);
2808                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2809                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2810                                 }
2811                                 if (x <= endsub)
2812                                 {
2813                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2814                                         tci = _mm_and_si128(tci, tcmax); 
2815                                         tci = _mm_madd_epi16(tci, tcoffset);
2816                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2817                                         x++;
2818                                 }
2819                         }
2820                 }
2821         }
2822 #endif
2823 }
2824
2825 void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2826 {
2827         // TODO: IMPLEMENT
2828         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2829 }
2830
// Shadowmap lookup stub: the vector argument is not examined and the result
// is always the constant 1.0f.
float DPSOFTRAST_SampleShadowmap(const float *vector)
{
        // TODO: IMPLEMENT
        const float result = 1.0f;
        return result;
}
2836
2837 void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
2838 {
2839         int x;
2840         int startx = span->startx;
2841         int endx = span->endx;
2842         float c[4];
2843         float data[4];
2844         float slope[4];
2845         float z;
2846         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2847         for (x = startx;x < endx;x++)
2848         {
2849                 z = zf[x];
2850                 c[0] = (data[0] + slope[0]*x) * z;
2851                 c[1] = (data[1] + slope[1]*x) * z;
2852                 c[2] = (data[2] + slope[2]*x) * z;
2853                 c[3] = (data[3] + slope[3]*x) * z;
2854                 out4f[x*4+0] = in4f[x*4+0] * c[0];
2855                 out4f[x*4+1] = in4f[x*4+1] * c[1];
2856                 out4f[x*4+2] = in4f[x*4+2] * c[2];
2857                 out4f[x*4+3] = in4f[x*4+3] * c[3];
2858         }
2859 }
2860
2861 void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
2862 {
2863         int x;
2864         int startx = span->startx;
2865         int endx = span->endx;
2866         float c[4];
2867         float data[4];
2868         float slope[4];
2869         float z;
2870         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2871         for (x = startx;x < endx;x++)
2872         {
2873                 z = zf[x];
2874                 c[0] = (data[0] + slope[0]*x) * z;
2875                 c[1] = (data[1] + slope[1]*x) * z;
2876                 c[2] = (data[2] + slope[2]*x) * z;
2877                 c[3] = (data[3] + slope[3]*x) * z;
2878                 out4f[x*4+0] = c[0];
2879                 out4f[x*4+1] = c[1];
2880                 out4f[x*4+2] = c[2];
2881                 out4f[x*4+3] = c[3];
2882         }
2883 }
2884
2885 void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
2886 {
2887         int x, startx = span->startx, endx = span->endx;
2888         float c[4], localcolor[4];
2889         localcolor[0] = subcolor[0];
2890         localcolor[1] = subcolor[1];
2891         localcolor[2] = subcolor[2];
2892         localcolor[3] = subcolor[3];
2893         for (x = startx;x < endx;x++)
2894         {
2895                 c[0] = inb4f[x*4+0] - localcolor[0];if (c[0] < 0.0f) c[0] = 0.0f;
2896                 c[1] = inb4f[x*4+1] - localcolor[1];if (c[1] < 0.0f) c[1] = 0.0f;
2897                 c[2] = inb4f[x*4+2] - localcolor[2];if (c[2] < 0.0f) c[2] = 0.0f;
2898                 c[3] = inb4f[x*4+3] - localcolor[3];if (c[3] < 0.0f) c[3] = 0.0f;
2899                 out4f[x*4+0] = ina4f[x*4+0] + c[0];
2900                 out4f[x*4+1] = ina4f[x*4+1] + c[1];
2901                 out4f[x*4+2] = ina4f[x*4+2] + c[2];
2902                 out4f[x*4+3] = ina4f[x*4+3] + c[3];
2903         }
2904 }
2905
2906 void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2907 {
2908         int x, startx = span->startx, endx = span->endx;
2909         for (x = startx;x < endx;x++)
2910         {
2911                 out4f[x*4+0] = ina4f[x*4+0] * inb4f[x*4+0];
2912                 out4f[x*4+1] = ina4f[x*4+1] * inb4f[x*4+1];
2913                 out4f[x*4+2] = ina4f[x*4+2] * inb4f[x*4+2];
2914                 out4f[x*4+3] = ina4f[x*4+3] * inb4f[x*4+3];
2915         }
2916 }
2917
2918 void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2919 {
2920         int x, startx = span->startx, endx = span->endx;
2921         for (x = startx;x < endx;x++)
2922         {
2923                 out4f[x*4+0] = ina4f[x*4+0] + inb4f[x*4+0];
2924                 out4f[x*4+1] = ina4f[x*4+1] + inb4f[x*4+1];
2925                 out4f[x*4+2] = ina4f[x*4+2] + inb4f[x*4+2];
2926                 out4f[x*4+3] = ina4f[x*4+3] + inb4f[x*4+3];
2927         }
2928 }
2929
2930 void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
2931 {
2932         int x, startx = span->startx, endx = span->endx;
2933         float a, b;
2934         for (x = startx;x < endx;x++)
2935         {
2936                 a = 1.0f - inb4f[x*4+3];
2937                 b = inb4f[x*4+3];
2938                 out4f[x*4+0] = ina4f[x*4+0] * a + inb4f[x*4+0] * b;
2939                 out4f[x*4+1] = ina4f[x*4+1] * a + inb4f[x*4+1] * b;
2940                 out4f[x*4+2] = ina4f[x*4+2] * a + inb4f[x*4+2] * b;
2941                 out4f[x*4+3] = ina4f[x*4+3] * a + inb4f[x*4+3] * b;
2942         }
2943 }
2944
2945 void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
2946 {
2947         int x, startx = span->startx, endx = span->endx;
2948         float localcolor[4], ilerp, lerp;
2949         localcolor[0] = color[0];
2950         localcolor[1] = color[1];
2951         localcolor[2] = color[2];
2952         localcolor[3] = color[3];
2953         ilerp = 1.0f - localcolor[3];
2954         lerp = localcolor[3];
2955         for (x = startx;x < endx;x++)
2956         {
2957                 out4f[x*4+0] = in4f[x*4+0] * ilerp + localcolor[0] * lerp;
2958                 out4f[x*4+1] = in4f[x*4+1] * ilerp + localcolor[1] * lerp;
2959                 out4f[x*4+2] = in4f[x*4+2] * ilerp + localcolor[2] * lerp;
2960                 out4f[x*4+3] = in4f[x*4+3] * ilerp + localcolor[3] * lerp;
2961         }
2962 }
2963
2964
2965
2966 void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2967 {
2968 #ifdef SSE_POSSIBLE
2969         int x;
2970         int startx = span->startx;
2971         int endx = span->endx;
2972         __m128 data, slope;
2973         __m128 mod, endmod;
2974         __m128i submod, substep, endsubmod;
2975         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2976         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2977         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2978         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2979         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2980         for (x = startx; x < endx;)
2981         {
2982                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2983                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2984                 if (nextsub >= endx)
2985                 {
2986                         nextsub = endsub = endx-1;
2987                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
2988                 }
2989                 mod = endmod;
2990                 submod = endsubmod;
2991                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2992                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
2993                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2994                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
2995                 substep = _mm_packs_epi32(substep, substep);
2996                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
2997                 {
2998                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
2999                         pix = _mm_mulhi_epu16(pix, submod);
3000                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3001                 }
3002                 if (x <= endsub)
3003                 {
3004                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3005                         pix = _mm_mulhi_epu16(pix, submod);
3006                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3007                         x++;
3008                 }
3009         }
3010 #endif
3011 }
3012
3013 void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3014 {
3015 #ifdef SSE_POSSIBLE
3016         int x;
3017         int startx = span->startx;
3018         int endx = span->endx;
3019         __m128 data, slope;
3020         __m128 mod, endmod;
3021         __m128i submod, substep, endsubmod;
3022         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3023         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3024         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3025         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3026         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3027         for (x = startx; x < endx;)
3028         {
3029                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3030                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3031                 if (nextsub >= endx)
3032                 {
3033                         nextsub = endsub = endx-1;
3034                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3035                 }
3036                 mod = endmod;
3037                 submod = endsubmod;
3038                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3039                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3040                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3041                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3042                 substep = _mm_packs_epi32(substep, substep);
3043                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3044                 {
3045                         __m128i pix = _mm_srai_epi16(submod, 4);
3046                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3047                 }
3048                 if (x <= endsub)
3049                 {
3050                         __m128i pix = _mm_srai_epi16(submod, 4);
3051                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3052                         x++;
3053                 }
3054         }
3055 #endif
3056 }
3057
3058 void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3059 {
3060 #ifdef SSE_POSSIBLE
3061         int x, startx = span->startx, endx = span->endx;
3062         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3063         localcolor = _mm_packs_epi32(localcolor, localcolor);
3064         for (x = startx;x+2 <= endx;x+=2)
3065         {
3066                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3067                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3068                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3069                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3070         }
3071         if (x < endx)
3072         {
3073                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3074                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3075                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3076                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3077         }
3078 #endif
3079 }
3080
3081 void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3082 {
3083 #ifdef SSE_POSSIBLE
3084         int x, startx = span->startx, endx = span->endx;
3085         for (x = startx;x+2 <= endx;x+=2)
3086         {
3087                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3088                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3089                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3090                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3091         }
3092         if (x < endx)
3093         {
3094                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3095                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3096                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3097                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3098         }
3099 #endif
3100 }
3101
3102 void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3103 {
3104 #ifdef SSE_POSSIBLE
3105         int x, startx = span->startx, endx = span->endx;
3106         for (x = startx;x+2 <= endx;x+=2)
3107         {
3108                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3109                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3110                 pix1 = _mm_add_epi16(pix1, pix2);
3111                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3112         }
3113         if (x < endx)
3114         {
3115                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3116                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3117                 pix1 = _mm_add_epi16(pix1, pix2);
3118                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3119         }
3120 #endif
3121 }
3122
3123 void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
3124 {
3125 #ifdef SSE_POSSIBLE
3126         int x, startx = span->startx, endx = span->endx;
3127         __m128i tint = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
3128         tint = _mm_packs_epi32(tint, tint);
3129         for (x = startx;x+2 <= endx;x+=2)
3130         {
3131                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3132                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3133                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3134                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3135         }
3136         if (x < endx)
3137         {
3138                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3139                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3140                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epu16(tint, pix2));
3141                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3142         }
3143 #endif
3144 }
3145
3146 void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3147 {
3148 #ifdef SSE_POSSIBLE
3149         int x, startx = span->startx, endx = span->endx;
3150         for (x = startx;x+2 <= endx;x+=2)
3151         {
3152                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3153                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3154                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3155                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3156                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3157         }
3158         if (x < endx)
3159         {
3160                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3161                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3162                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3163                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3164                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3165         }
3166 #endif
3167 }
3168
3169 void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3170 {
3171 #ifdef SSE_POSSIBLE
3172         int x, startx = span->startx, endx = span->endx;
3173         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3174         localcolor = _mm_packs_epi32(localcolor, localcolor);
3175         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3176         for (x = startx;x+2 <= endx;x+=2)
3177         {
3178                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3179                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3180                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3181         }
3182         if (x < endx)
3183         {
3184                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3185                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3186                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3187         }
3188 #endif
3189 }
3190
3191
3192
3193 void DPSOFTRAST_VertexShader_Generic(void)
3194 {
3195         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3196         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3197         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3198         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3199                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3200 }
3201
3202 void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3203 {
3204         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3205         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3206         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3207         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3208         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3209         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3210         {
3211                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3212                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3213                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3214                 {
3215                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3216                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3217                         {
3218                                 // multiply
3219                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3220                         }
3221                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3222                         {
3223                                 // add
3224                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3225                         }
3226                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3227                         {
3228                                 // alphablend
3229                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3230                         }
3231                 }
3232         }
3233         else
3234                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3235         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3236 }
3237
3238
3239
3240 void DPSOFTRAST_VertexShader_PostProcess(void)
3241 {
3242         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3243         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3244         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3245 }
3246
3247 void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3248 {
3249         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3250         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3251         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3252         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3253         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3254         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3255         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3256         {
3257                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3258                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3259         }
3260         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3261         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3262         {
3263                 // TODO: implement saturation
3264         }
3265         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3266         {
3267                 // TODO: implement gammaramps
3268         }
3269         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3270 }
3271
3272
3273
3274 void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3275 {
3276         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3277 }
3278
3279 void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3280 {
3281         // this is never called (because colormask is off when this shader is used)
3282         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3283         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3284         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3285         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3286         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3287 }
3288
3289
3290
3291 void DPSOFTRAST_VertexShader_FlatColor(void)
3292 {
3293         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3294         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3295 }
3296
3297 void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3298 {
3299 #ifdef SSE_POSSIBLE
3300         unsigned char * RESTRICT pixelmask = span->pixelmask;
3301         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3302         int x, startx = span->startx, endx = span->endx;
3303         __m128i Color_Ambientm;
3304         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3305         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3306         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3307         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3308         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3309         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3310                 pixel = buffer_FragColorbgra8;
3311         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3312         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3313         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3314         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3315         for (x = startx;x < endx;x++)
3316         {
3317                 __m128i color, pix;
3318                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3319                 {
3320                         __m128i pix2;
3321                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3322                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3323                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3324                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3325                         x += 3;
3326                         continue;
3327                 }
3328                 if (!pixelmask[x])
3329                         continue;
3330                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3331                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3332                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3333         }
3334         if (pixel == buffer_FragColorbgra8)
3335                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3336 #endif
3337 }
3338
3339
3340
3341 void DPSOFTRAST_VertexShader_VertexColor(void)
3342 {
3343         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3344         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3345         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3346 }
3347
3348 void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3349 {
3350 #ifdef SSE_POSSIBLE
3351         unsigned char * RESTRICT pixelmask = span->pixelmask;
3352         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3353         int x, startx = span->startx, endx = span->endx;
3354         __m128i Color_Ambientm, Color_Diffusem;
3355         __m128 data, slope;
3356         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3357         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3358         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3359         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3360         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3361         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3362         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3363                 pixel = buffer_FragColorbgra8;
3364         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3365         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3366         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3367         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3368         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3369         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3370         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3371         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3372         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3373         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3374         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3375         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3376         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3377         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3378         {
3379                 __m128i color, mod, pix;
3380                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3381                 {
3382                         __m128i pix2, mod2;
3383                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3384                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3385                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3386                         data = _mm_add_ps(data, slope);
3387                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3388                         data = _mm_add_ps(data, slope);
3389                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3390                         data = _mm_add_ps(data, slope);
3391                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3392                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3393                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3394                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3395                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3396                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3397                         x += 3;
3398                         continue;
3399                 }
3400                 if (!pixelmask[x])
3401                         continue;
3402                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3403                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3404                 mod = _mm_packs_epi32(mod, mod);
3405                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3406                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3407         }
3408         if (pixel == buffer_FragColorbgra8)
3409                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3410 #endif
3411 }
3412
3413
3414
3415 void DPSOFTRAST_VertexShader_Lightmap(void)
3416 {
3417         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3418         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3419         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3420 }
3421
3422 void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3423 {
3424 #ifdef SSE_POSSIBLE
3425         unsigned char * RESTRICT pixelmask = span->pixelmask;
3426         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3427         int x, startx = span->startx, endx = span->endx;
3428         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3429         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3430         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3431         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3432         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3433         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3434         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3435         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3436         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3437         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3438                 pixel = buffer_FragColorbgra8;
3439         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3440         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3441         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3442         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3443         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3444         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3445         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3446         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3447         {
3448                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3449                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3450                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3451                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
3452                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3453                 for (x = startx;x < endx;x++)
3454                 {
3455                         __m128i color, lightmap, glow, pix;
3456                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3457                         {
3458                                 __m128i pix2;
3459                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3460                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3461                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
3462                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3463                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3464                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
3465                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3466                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3467                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
3468                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3469                                 x += 3;
3470                                 continue;
3471                         }
3472                         if (!pixelmask[x])
3473                                 continue;
3474                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3475                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3476                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
3477                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
3478                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3479                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3480                 }
3481         }
3482         else
3483         {
3484                 for (x = startx;x < endx;x++)
3485                 {
3486                         __m128i color, lightmap, pix;
3487                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3488                         {
3489                                 __m128i pix2;
3490                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3491                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3492                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3493                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3494                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3495                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3496                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3497                                 x += 3;
3498                                 continue;
3499                         }
3500                         if (!pixelmask[x]) 
3501                                 continue;
3502                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3503                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3504                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3505                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3506                 }
3507         }
3508         if (pixel == buffer_FragColorbgra8)
3509                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3510 #endif
3511 }
3512
3513
3514 void DPSOFTRAST_VertexShader_LightDirection(void);
3515 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3516
// FakeLight vertex shader: all of the work is done by the LightDirection
// vertex shader, this entry point simply forwards to it.
void DPSOFTRAST_VertexShader_FakeLight(void)
{
	DPSOFTRAST_VertexShader_LightDirection();
}
3521
3522 void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3523 {
3524         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3525 }
3526
3527
3528
3529 void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3530 {
3531         DPSOFTRAST_VertexShader_LightDirection();
3532         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3533 }
3534
3535 void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3536 {
3537         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3538 }
3539
3540
3541
3542 void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3543 {
3544         DPSOFTRAST_VertexShader_LightDirection();
3545         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3546 }
3547
3548 void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3549 {
3550         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3551 }
3552
3553
3554
3555 void DPSOFTRAST_VertexShader_LightDirection(void)
3556 {
3557         int i;
3558         int numvertices = dpsoftrast.numvertices;
3559         float LightDir[4];
3560         float LightVector[4];
3561         float EyePosition[4];
3562         float EyeVectorModelSpace[4];
3563         float EyeVector[4];
3564         float position[4];
3565         float svector[4];
3566         float tvector[4];
3567         float normal[4];
3568         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3569         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3570         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3571         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3572         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3573         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3574         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3575         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3576         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3577         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3578         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3579         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3580         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3581         for (i = 0;i < numvertices;i++)
3582         {
3583                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3584                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3585                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3586                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3587                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3588                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3589                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3590                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3591                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3592                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3593                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3594                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3595                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3596                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3597                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3598                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3599                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3600                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3601                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3602                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3603                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3604                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3605                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3606                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3607                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3608                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3609                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3610                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3611                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3612         }
3613         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3614 }
3615
3616 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3617 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3618 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3619 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3620 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3621 #define DPSOFTRAST_Vector3Normalize(v)\
3622 do\
3623 {\
3624         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3625         if (len)\
3626         {\
3627                 len = 1.0f / len;\
3628                 v[0] *= len;\
3629                 v[1] *= len;\
3630                 v[2] *= len;\
3631         }\
3632 }\
3633 while(0)
3634
3635 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3636 {
3637         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3638         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3639         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3640         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3641         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3642         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3643         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3644         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3645         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3646         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3647         int x, startx = span->startx, endx = span->endx;
3648         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3649         float LightVectordata[4];
3650         float LightVectorslope[4];
3651         float EyeVectordata[4];
3652         float EyeVectorslope[4];
3653         float VectorSdata[4];
3654         float VectorSslope[4];
3655         float VectorTdata[4];
3656         float VectorTslope[4];
3657         float VectorRdata[4];
3658         float VectorRslope[4];
3659         float z;
3660         float diffusetex[4];
3661         float glosstex[4];
3662         float surfacenormal[4];
3663         float lightnormal[4];
3664         float lightnormal_modelspace[4];
3665         float eyenormal[4];
3666         float specularnormal[4];
3667         float diffuse;
3668         float specular;
3669         float SpecularPower;
3670         int d[4];
3671         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3672         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3673         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3674         Color_Glow[3] = 0.0f;
3675         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3676         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3677         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3678         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3679         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3680         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3681         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3682         Color_Pants[3] = 0.0f;
3683         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3684         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3685         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3686         Color_Shirt[3] = 0.0f;
3687         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3688         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3689         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3690         {
3691                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3692                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3693         }
3694         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3695         {
3696                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3697         }
3698         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3699         {
3700                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3701                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3702                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3703                 Color_Diffuse[3] = 0.0f;
3704                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3705                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3706                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3707                 LightColor[3] = 0.0f;
3708                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3709                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3710                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3711                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3712                 Color_Specular[3] = 0.0f;
3713                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3714                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3715                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3716
3717                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3718                 {
3719                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3720                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3721                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3722                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3723                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3724                 }
3725                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3726                 {
3727                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3728                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3729                 }
3730                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3731                 {
3732                         // nothing of this needed
3733                 }
3734                 else
3735                 {
3736                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3737                 }
3738
3739                 for (x = startx;x < endx;x++)
3740                 {
3741                         z = buffer_z[x];
3742                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3743                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3744                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3745                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3746                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3747                         {
3748                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3749                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3750                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3751                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3752                         }
3753                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3754                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3755                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3756                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3757                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3758                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3759                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3760                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3761
3762                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3763                         {
3764                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3765                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3766                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3767                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3768
3769                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3770                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3771                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3772                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3773
3774                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3775                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3776                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3777                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3778
3779                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3780                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3781                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3782                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3783
3784                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3785                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3786
3787                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3788                                 {
3789                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3790                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3791                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3792                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3793                                 }
3794                         }
3795                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3796                         {
3797                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3798                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3799                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3800                                 {
3801                                         float f = 1.0f / 256.0f;
3802                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3803                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3804                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3805                                 }
3806                         }
3807                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3808                         {
3809                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3810                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3811                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3812                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3813
3814                                 LightColor[0] = 1.0;
3815                                 LightColor[1] = 1.0;
3816                                 LightColor[2] = 1.0;
3817                         }
3818                         else
3819                         {
3820                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3821                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3822                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3823                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3824                         }
3825
3826                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3827
3828                         if(thread->shader_exactspecularmath)
3829                         {
3830                                 // reflect lightnormal at surfacenormal, take the negative of that
3831                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3832                                 float f;
3833                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3834                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3835                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3836                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3837
3838                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3839                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3840                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3841                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3842                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3843
3844                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3845                                 specular = pow(specular, 0.25f + SpecularPower * glosstex[3]);
3846                         }
3847                         else
3848                         {
3849                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3850                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3851                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3852                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3853
3854                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3855                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3856                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3857                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3858
3859                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3860                                 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
3861                         }
3862
3863                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3864                         {
3865                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3866                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3867                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3868                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3869                         }
3870                         else
3871                         {
3872                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3873                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3874                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3875                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3876                         }
3877
3878                         buffer_FragColorbgra8[x*4+0] = d[0];
3879                         buffer_FragColorbgra8[x*4+1] = d[1];
3880                         buffer_FragColorbgra8[x*4+2] = d[2];
3881                         buffer_FragColorbgra8[x*4+3] = d[3];
3882                 }
3883         }
3884         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3885         {
3886                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3887                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3888                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3889                 Color_Diffuse[3] = 0.0f;
3890                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3891                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3892                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3893                 LightColor[3] = 0.0f;
3894                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3895
3896                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3897                 {
3898                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3899                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3900                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3901                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3902                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3903                 }
3904                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3905                 {
3906                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3907                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3908                 }
3909                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3910                 {
3911                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3912                 }
3913                 else
3914                 {
3915                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3916                 }
3917
3918                 for (x = startx;x < endx;x++)
3919                 {
3920                         z = buffer_z[x];
3921                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3922                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3923                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3924                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3925                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3926                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3927                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3928                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3929
3930                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3931                         {
3932                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3933                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3934                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3935                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3936
3937                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3938                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3939                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3940                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3941
3942                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3943                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3944                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3945                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3946
3947                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3948                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3949                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3950                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3951
3952                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3953                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3954
3955                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3956                                 {
3957                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3958                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3959                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3960                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3961                                 }
3962                         }
3963                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3964                         {
3965                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3966                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3967                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3968                                 {
3969                                         float f = 1.0f / 256.0f;
3970                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3971                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3972                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3973                                 }
3974                         }
3975                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3976                         {
3977                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3978                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3979                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3980                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3981
3982                                 LightColor[0] = 1.0;
3983                                 LightColor[1] = 1.0;
3984                                 LightColor[2] = 1.0;
3985                         }
3986                         else
3987                         {
3988                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3989                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3990                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3991                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3992                         }
3993
3994                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3995                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3996                         {
3997                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
3998                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
3999                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4000                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4001                         }
4002                         else
4003                         {
4004                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4005                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4006                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4007                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4008                         }
4009                         buffer_FragColorbgra8[x*4+0] = d[0];
4010                         buffer_FragColorbgra8[x*4+1] = d[1];
4011                         buffer_FragColorbgra8[x*4+2] = d[2];
4012                         buffer_FragColorbgra8[x*4+3] = d[3];
4013                 }
4014         }
4015         else
4016         {
4017                 for (x = startx;x < endx;x++)
4018                 {
4019                         z = buffer_z[x];
4020                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4021                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4022                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4023                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4024
4025                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4026                         {
4027                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4028                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4029                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4030                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4031                         }
4032                         else
4033                         {
4034                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4035                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4036                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4037                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4038                         }
4039                         buffer_FragColorbgra8[x*4+0] = d[0];
4040                         buffer_FragColorbgra8[x*4+1] = d[1];
4041                         buffer_FragColorbgra8[x*4+2] = d[2];
4042                         buffer_FragColorbgra8[x*4+3] = d[3];
4043                 }
4044         }
4045         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4046 }
4047
4048
4049
4050 void DPSOFTRAST_VertexShader_LightSource(void)
4051 {
4052         int i;
4053         int numvertices = dpsoftrast.numvertices;
4054         float LightPosition[4];
4055         float LightVector[4];
4056         float LightVectorModelSpace[4];
4057         float EyePosition[4];
4058         float EyeVectorModelSpace[4];
4059         float EyeVector[4];
4060         float position[4];
4061         float svector[4];
4062         float tvector[4];
4063         float normal[4];
4064         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0];
4065         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4066         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4067         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3];
4068         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4069         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4070         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4071         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4072         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4073         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4074         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4075         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4076         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4077         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
4078         for (i = 0;i < numvertices;i++)
4079         {
4080                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4081                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4082                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4083                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4084                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4085                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4086                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4087                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4088                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4089                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4090                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4091                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4092                 LightVectorModelSpace[0] = LightPosition[0] - position[0];
4093                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4094                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4095                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2];
4096                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2];
4097                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2];
4098                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0];
4099                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4100                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4101                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f;
4102                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4103                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4104                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4105                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4106                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4107                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4108                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0];
4109                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4110                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4111                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f;
4112         }
4113         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4114         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1);
4115 }
4116
4117 void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4118 {
4119 #ifdef SSE_POSSIBLE
4120         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4121         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4122         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4123         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4124         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4125         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4126         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4127         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4128         int x, startx = span->startx, endx = span->endx;
4129         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4130         float CubeVectordata[4];
4131         float CubeVectorslope[4];
4132         float LightVectordata[4];
4133         float LightVectorslope[4];
4134         float EyeVectordata[4];
4135         float EyeVectorslope[4];
4136         float z;
4137         float diffusetex[4];
4138         float glosstex[4];
4139         float surfacenormal[4];
4140         float lightnormal[4];
4141         float eyenormal[4];
4142         float specularnormal[4];
4143         float diffuse;
4144         float specular;
4145         float SpecularPower;
4146         float CubeVector[4];
4147         float attenuation;
4148         int d[4];
4149         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4150         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4151         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4152         Color_Glow[3] = 0.0f;
4153         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4154         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4155         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4156         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4157         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4158         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4159         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4160         Color_Diffuse[3] = 0.0f;
4161         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4162         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4163         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4164         Color_Specular[3] = 0.0f;
4165         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4166         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4167         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4168         Color_Pants[3] = 0.0f;
4169         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4170         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4171         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4172         Color_Shirt[3] = 0.0f;
4173         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4174         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4175         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4176         LightColor[3] = 0.0f;
4177         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4178         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4179         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4180         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4181         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4182         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4183         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4184         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4185         {
4186                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4187                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4188         }
4189         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4190                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4191         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4192         {
4193                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4194                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4195                 for (x = startx;x < endx;x++)
4196                 {
4197                         z = buffer_z[x];
4198                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4199                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4200                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4201                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4202                         if (attenuation < 0.01f)
4203                                 continue;
4204                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4205                         {
4206                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4207                                 if (attenuation < 0.01f)
4208                                         continue;
4209                         }
4210
4211                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4212                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4213                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4214                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4215                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4216                         {
4217                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4218                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4219                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4220                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4221                         }
4222                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4223                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4224                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4225                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4226                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4227                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4228                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4229                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4230
4231                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4232                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4233                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4234                         DPSOFTRAST_Vector3Normalize(lightnormal);
4235
4236                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4237
4238                         if(thread->shader_exactspecularmath)
4239                         {
4240                                 // reflect lightnormal at surfacenormal, take the negative of that
4241                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4242                                 float f;
4243                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4244                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4245                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4246                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4247
4248                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4249                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4250                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4251                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4252                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4253
4254                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4255                                 specular = pow(specular, 0.25f + SpecularPower * glosstex[3]);
4256                         }
4257                         else
4258                         {
4259                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4260                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4261                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4262                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4263
4264                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4265                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4266                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4267                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4268
4269                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4270                                 specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
4271                         }
4272
4273                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4274                         {
4275                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4276                                 attenuation *= (1.0f / 255.0f);
4277                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4278                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4279                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4280                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4281                         }
4282                         else
4283                         {
4284                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4285                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4286                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4287                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4288                         }
4289                         buffer_FragColorbgra8[x*4+0] = d[0];
4290                         buffer_FragColorbgra8[x*4+1] = d[1];
4291                         buffer_FragColorbgra8[x*4+2] = d[2];
4292                         buffer_FragColorbgra8[x*4+3] = d[3];
4293                 }
4294         }
4295         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4296         {
4297                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4298                 for (x = startx;x < endx;x++)
4299                 {
4300                         z = buffer_z[x];
4301                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4302                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4303                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4304                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4305                         if (attenuation < 0.01f)
4306                                 continue;
4307                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4308                         {
4309                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4310                                 if (attenuation < 0.01f)
4311                                         continue;
4312                         }
4313
4314                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4315                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4316                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4317                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4318                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4319                         {
4320                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4321                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4322                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4323                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4324                         }
4325                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4326                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4327                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4328                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4329
4330                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4331                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4332                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4333                         DPSOFTRAST_Vector3Normalize(lightnormal);
4334
4335                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4336                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4337                         {
4338                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4339                                 attenuation *= (1.0f / 255.0f);
4340                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4341                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4342                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4343                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4344                         }
4345                         else
4346                         {
4347                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4348                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4349                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4350                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4351                         }
4352                         buffer_FragColorbgra8[x*4+0] = d[0];
4353                         buffer_FragColorbgra8[x*4+1] = d[1];
4354                         buffer_FragColorbgra8[x*4+2] = d[2];
4355                         buffer_FragColorbgra8[x*4+3] = d[3];
4356                 }
4357         }
4358         else
4359         {
4360                 for (x = startx;x < endx;x++)
4361                 {
4362                         z = buffer_z[x];
4363                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4364                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4365                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4366                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4367                         if (attenuation < 0.01f)
4368                                 continue;
4369                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4370                         {
4371                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4372                                 if (attenuation < 0.01f)
4373                                         continue;
4374                         }
4375
4376                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4377                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4378                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4379                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4380                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4381                         {
4382                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4383                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4384                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4385                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4386                         }
4387                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4388                         {
4389                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4390                                 attenuation *= (1.0f / 255.0f);
4391                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4392                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4393                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4394                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4395                         }
4396                         else
4397                         {
4398                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4399                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4400                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4401                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4402                         }
4403                         buffer_FragColorbgra8[x*4+0] = d[0];
4404                         buffer_FragColorbgra8[x*4+1] = d[1];
4405                         buffer_FragColorbgra8[x*4+2] = d[2];
4406                         buffer_FragColorbgra8[x*4+3] = d[3];
4407                 }
4408         }
4409         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4410 #endif
4411 }
4412
4413
4414
4415 void DPSOFTRAST_VertexShader_Refraction(void)
4416 {
4417         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4418         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4419         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4420 }
4421
4422 void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4423 {
4424         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4425         float z;
4426         int x, startx = span->startx, endx = span->endx;
4427
4428         // texture reads
4429         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4430         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4431
4432         // varyings
4433         float ModelViewProjectionPositiondata[4];
4434         float ModelViewProjectionPositionslope[4];
4435
4436         // uniforms
4437         float ScreenScaleRefractReflect[2];
4438         float ScreenCenterRefractReflect[2];
4439         float DistortScaleRefractReflect[2];
4440         float RefractColor[4];
4441
4442         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4443         if(!texture) return;
4444
4445         // read textures
4446         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4447         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4448
4449         // read varyings
4450         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4451
4452         // read uniforms
4453         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4454         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4455         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4456         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4457         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4458         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4459         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4460         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4461         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4462         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4463
4464         // do stuff
4465         for (x = startx;x < endx;x++)
4466         {
4467                 float SafeScreenTexCoord[2];
4468                 float ScreenTexCoord[2];
4469                 float v[3];
4470                 float iw;
4471                 unsigned char c[4];
4472
4473                 z = buffer_z[x];
4474
4475                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4476                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4477
4478                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4479                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4480                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4481
4482                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4483                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4484                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4485                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4486                 DPSOFTRAST_Vector3Normalize(v);
4487                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4488                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4489
4490                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4491                 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4492
4493                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4494                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4495                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4496                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4497         }
4498
4499         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4500 }
4501
4502
4503
4504 void DPSOFTRAST_VertexShader_Water(void)
4505 {
4506         int i;
4507         int numvertices = dpsoftrast.numvertices;
4508         float EyePosition[4];
4509         float EyeVectorModelSpace[4];
4510         float EyeVector[4];
4511         float position[4];
4512         float svector[4];
4513         float tvector[4];
4514         float normal[4];
4515         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4516         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4517         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4518         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4519         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4520         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4521         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4522         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4523         for (i = 0;i < numvertices;i++)
4524         {
4525                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4526                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4527                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4528                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4529                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4530                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4531                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4532                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4533                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4534                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4535                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4536                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4537                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4538                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4539                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4540                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4541                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4542                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4543                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4544                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4545                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4546                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4547         }
4548         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4549         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4550         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4551 }
4552
4553
4554 void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4555 {
4556         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4557         float z;
4558         int x, startx = span->startx, endx = span->endx;
4559
4560         // texture reads
4561         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4562         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4563
4564         // varyings
4565         float ModelViewProjectionPositiondata[4];
4566         float ModelViewProjectionPositionslope[4];
4567         float EyeVectordata[4];
4568         float EyeVectorslope[4];
4569
4570         // uniforms
4571         float ScreenScaleRefractReflect[4];
4572         float ScreenCenterRefractReflect[4];
4573         float DistortScaleRefractReflect[4];
4574         float RefractColor[4];
4575         float ReflectColor[4];
4576         float ReflectFactor;
4577         float ReflectOffset;
4578
4579         DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4580         DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4581         if(!texture_refraction || !texture_reflection) return;
4582
4583         // read textures
4584         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4585         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4586
4587         // read varyings
4588         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4589         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4590
4591         // read uniforms
4592         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4593         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4594         ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4595         ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4596         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4597         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4598         ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4599         ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4600         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4601         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4602         DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4603         DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4604         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4605         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4606         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4607         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4608         ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4609         ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4610         ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4611         ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4612         ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4613         ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4614
4615         // do stuff
4616         for (x = startx;x < endx;x++)
4617         {
4618                 float SafeScreenTexCoord[4];
4619                 float ScreenTexCoord[4];
4620                 float v[3];
4621                 float iw;
4622                 unsigned char c1[4];
4623                 unsigned char c2[4];
4624                 float Fresnel;
4625
4626                 z = buffer_z[x];
4627
4628                 // "    vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4629                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4630
4631                 // "    vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4632                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4633                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4634                 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4635                 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4636
4637                 // "    vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4638                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4639                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4640                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4641                 DPSOFTRAST_Vector3Normalize(v);
4642                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4643                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4644                 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4645                 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4646
4647                 // "    float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4648                 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4649                 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4650                 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4651                 DPSOFTRAST_Vector3Normalize(v);
4652                 Fresnel = 1.0f - v[2];
4653                 Fresnel = min(1.0f, Fresnel);
4654                 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4655
4656                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4657                 // "    dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4658                 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4659                 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4660
4661                 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4662                 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4663                 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4664                 buffer_FragColorbgra8[x*4+3] = min((    RefractColor[3] *  (1.0f - Fresnel) +          ReflectColor[3]  * Fresnel) * 256, 255);
4665         }
4666
4667         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4668 }
4669
4670
4671
4672 void DPSOFTRAST_VertexShader_ShowDepth(void)
4673 {
4674         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4675 }
4676
4677 void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4678 {
4679         // TODO: IMPLEMENT
4680         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4681         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4682         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4683         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4684         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4685 }
4686
4687
4688
4689 void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4690 {
4691         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4692 }
4693
4694 void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4695 {
4696         // TODO: IMPLEMENT
4697         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4698         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4699         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4700         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4701         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4702 }
4703
4704
4705
4706 void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4707 {
4708         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4709 }
4710
4711 void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4712 {
4713         // TODO: IMPLEMENT
4714         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4715         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4716         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4717         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4718         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4719 }
4720
4721
4722
// Describes one shader mode: its two stage callbacks plus the vertex arrays
// and texture units it consumes. One entry per mode lives in
// DPSOFTRAST_ShaderModeTable, indexed by thread->shader_mode.
4723 typedef struct DPSOFTRAST_ShaderModeInfo_s
4724 {
4725         int lodarrayindex; // array index compared against each arrays[] entry in DPSOFTRAST_Interpret_Draw; the matching array supplies the mip edge scale
4726         void (*Vertex)(void); // vertex stage entry point (caller not visible in this part of the file)
4727         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span); // pixel stage, invoked once per surviving span by DPSOFTRAST_Draw_ProcessSpans
4728         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL]; // array indices to interpolate; readers stop at the first value >= DPSOFTRAST_ARRAY_TOTAL
4729         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS]; // texture units to compute mip levels for; readers stop at the first value >= DPSOFTRAST_MAXTEXTUREUNITS
4730 }
4731 DPSOFTRAST_ShaderModeInfo;
4732
4733 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4734 {
4735         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4736         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4737         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4738         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4739         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4740         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4741         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4742         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4743         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4744         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4745         {2, DPSOFTRAST_VertexShader_VertexColor,                        DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4746         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4747         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4748         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4749         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4750         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4751         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4752         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4753 };
4754
4755 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4756 {
4757         int x;
4758         int startx;
4759         int endx;
4760         unsigned int *depthpixel;
4761         int depth;
4762         int depthslope;
4763         unsigned int d;
4764         unsigned char *pixelmask;
4765         DPSOFTRAST_State_Triangle *triangle;
4766         triangle = &thread->triangles[span->triangle];
4767         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4768         startx = span->startx;
4769         endx = span->endx;
4770         depth = span->depthbase;
4771         depthslope = span->depthslope;
4772         pixelmask = thread->pixelmaskarray;
4773         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4774         {
4775                 switch(thread->fb_depthfunc)
4776                 {
4777                 default:
4778                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4779                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4780                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4781                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4782                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4783                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4784                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4785                 }
4786                 while (startx < endx && !pixelmask[startx])
4787                         startx++;
4788                 while (endx > startx && !pixelmask[endx-1])
4789                         endx--;
4790         }
4791         else
4792         {
4793                 // no depth testing means we're just dealing with color...
4794                 memset(pixelmask + startx, 1, endx - startx);
4795         }
4796         span->pixelmask = pixelmask;
4797         span->startx = startx;
4798         span->endx = endx;
4799 }
4800
4801 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4802 {
4803         int x, d, depth, depthslope, startx, endx;
4804         const unsigned char *pixelmask;
4805         unsigned int *depthpixel;
4806         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4807         {
4808                 depth = span->depthbase;
4809                 depthslope = span->depthslope;
4810                 pixelmask = span->pixelmask;
4811                 startx = span->startx;
4812                 endx = span->endx;
4813                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4814                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4815                         if (pixelmask[x])
4816                                 depthpixel[x] = d;
4817         }
4818 }
4819
4820 void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4821 {
4822         int i;
4823         DPSOFTRAST_State_Triangle *triangle;
4824         DPSOFTRAST_State_Span *span;
4825         for (i = 0; i < thread->numspans; i++)
4826         {
4827                 span = &thread->spans[i];
4828                 triangle = &thread->triangles[span->triangle];
4829                 DPSOFTRAST_Draw_DepthTest(thread, span);
4830                 if (span->startx >= span->endx)
4831                         continue;
4832                 // run pixel shader if appropriate
4833                 // do this before running depthmask code, to allow the pixelshader
4834                 // to clear pixelmask values for alpha testing
4835                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4836                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4837                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4838         }
4839         thread->numspans = 0;
4840 }
4841
// Declares the Draw command record consumed by DPSOFTRAST_Interpret_Draw
// (as DPSOFTRAST_Command_Draw). refcount is decremented by each interpreting
// thread; arrays holds the screen-space coordinates followed by the other
// vertex arrays. NOTE(review): 22 is presumably the command opcode and
// DEFCOMMAND presumably adds the common header (e.g. commandsize) - confirm
// against the DEFCOMMAND definition.
4842 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;);
4843
4844 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4845 {
4846 #ifdef SSE_POSSIBLE
4847         int cullface = thread->cullface;
4848         int minx, maxx, miny, maxy;
4849         int miny1, maxy1, miny2, maxy2;
4850         __m128i fbmin, fbmax;
4851         __m128 viewportcenter, viewportscale;
4852         int firstvertex = command->firstvertex;
4853         int numvertices = command->numvertices;
4854         int numtriangles = command->numtriangles;
4855         const int *element3i = command->element3i;
4856         const unsigned short *element3s = command->element3s;
4857         int clipped = command->clipped;
4858         int i;
4859         int j;
4860         int k;
4861         int y;
4862         int e[3];
4863         __m128i screeny;
4864         int starty, endy, bandy;
4865         int numpoints;
4866         int clipcase;
4867         float clipdist[4];
4868         float clip0origin, clip0slope;
4869         int clip0dir;
4870         __m128 triangleedge1, triangleedge2, trianglenormal;
4871         __m128 clipfrac[3];
4872         __m128 screen[4];
4873         DPSOFTRAST_State_Triangle *triangle;
4874         DPSOFTRAST_Texture *texture;
4875         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4876         miny = thread->fb_scissor[1];
4877         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4878         miny1 = bound(miny, thread->miny1, maxy);
4879         maxy1 = bound(miny, thread->maxy1, maxy);
4880         miny2 = bound(miny, thread->miny2, maxy);
4881         maxy2 = bound(miny, thread->maxy2, maxy);
4882         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4883         {
4884                 if (!ATOMIC_DECREMENT(command->refcount))
4885                 {
4886                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4887                                 MM_FREE(command->arrays);
4888                 }
4889                 return;
4890         }
4891         minx = thread->fb_scissor[0];
4892         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4893         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4894         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4895         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4896         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4897         screen[3] = _mm_setzero_ps();
4898         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4899         for (i = 0;i < numtriangles;i++)
4900         {
4901                 const float *screencoord4f = command->arrays;
4902                 const float *arrays = screencoord4f + numvertices*4;
4903
4904                 // generate the 3 edges of this triangle
4905                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4906                 if (element3s)
4907                 {
4908                         e[0] = element3s[i*3+0] - firstvertex;
4909                         e[1] = element3s[i*3+1] - firstvertex;
4910                         e[2] = element3s[i*3+2] - firstvertex;
4911                 }
4912                 else if (element3i)
4913                 {
4914                         e[0] = element3i[i*3+0] - firstvertex;
4915                         e[1] = element3i[i*3+1] - firstvertex;
4916                         e[2] = element3i[i*3+2] - firstvertex;
4917                 }
4918                 else
4919                 {
4920                         e[0] = i*3+0;
4921                         e[1] = i*3+1;
4922                         e[2] = i*3+2;
4923                 }
4924
4925 #define SKIPBACKFACE \
4926                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4927                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4928                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4929                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4930                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4931                 switch(cullface) \
4932                 { \
4933                 case GL_BACK: \
4934                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4935                                 continue; \
4936                         break; \
4937                 case GL_FRONT: \
4938                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4939                                 continue; \
4940                         break; \
4941                 }
4942
4943 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4944                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4945                         { \
4946                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4947                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4948                         }
4949 #define CLIPPEDVERTEXCOPY(k,p1) \
4950                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4951
4952 #define GENATTRIBCOPY(attrib, p1) \
4953                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4954 #define GENATTRIBLERP(attrib, p1, p2) \
4955                 { \
4956                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4957                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4958                 }
4959 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4960                 switch(clipcase) \
4961                 { \
4962                 default: \
4963                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4964                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4965                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4966                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4967                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4968                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4969                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4970                 }
4971
4972                 if (! clipped)
4973                         goto notclipped;
4974
4975                 // calculate distance from nearplane
4976                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4977                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4978                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4979                 if (clipdist[0] >= 0.0f)
4980                 {
4981                         if (clipdist[1] >= 0.0f)
4982                         {
4983                                 if (clipdist[2] >= 0.0f)
4984                                 {
4985                                 notclipped:
4986                                         // triangle is entirely in front of nearplane
4987                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
4988                                         SKIPBACKFACE;
4989                                         numpoints = 3;
4990                                         clipcase = 0;
4991                                 }
4992                                 else
4993                                 {
4994                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
4995                                         SKIPBACKFACE;
4996                                         numpoints = 4;
4997                                         clipcase = 1;
4998                                 }
4999                         }
5000                         else
5001                         {
5002                                 if (clipdist[2] >= 0.0f)
5003                                 {
5004                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5005                                         SKIPBACKFACE;
5006                                         numpoints = 4;
5007                                         clipcase = 2;
5008                                 }
5009                                 else
5010                                 {
5011                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5012                                         SKIPBACKFACE;
5013                                         numpoints = 3;
5014                                         clipcase = 3;
5015                                 }
5016                         }
5017                 }
5018                 else if (clipdist[1] >= 0.0f)
5019                 {
5020                         if (clipdist[2] >= 0.0f)
5021                         {
5022                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5023                                 SKIPBACKFACE;
5024                                 numpoints = 4;
5025                                 clipcase = 4;
5026                         }
5027                         else
5028                         {
5029                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5030                                 SKIPBACKFACE;
5031                                 numpoints = 3;
5032                                 clipcase = 5;
5033                         }
5034                 }
5035                 else if (clipdist[2] >= 0.0f)
5036                 {
5037                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5038                         SKIPBACKFACE;
5039                         numpoints = 3;
5040                         clipcase = 6;
5041                 }
5042                 else continue; // triangle is entirely behind nearplane
5043
5044                 {
5045                         // calculate integer y coords for triangle points
5046                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5047                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5048                                         screenmin = _mm_min_epi16(screeni, screenir),
5049                                         screenmax = _mm_max_epi16(screeni, screenir);
5050                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5051                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5052                         screenmin = _mm_max_epi16(screenmin, fbmin);
5053                         screenmax = _mm_min_epi16(screenmax, fbmax);
5054                         // skip offscreen triangles
5055                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5056                                 continue;
5057                         starty = _mm_extract_epi16(screenmin, 1);
5058                         endy = _mm_extract_epi16(screenmax, 1)+1;
5059                         if (starty >= maxy1 && endy <= miny2)
5060                                 continue;
5061                         screeny = _mm_srai_epi32(screeni, 16);
5062                 }
5063
5064                 triangle = &thread->triangles[thread->numtriangles];
5065
5066                 // calculate attribute plans for triangle data...
5067                 // okay, this triangle is going to produce spans, we'd better project
5068                 // the interpolants now (this is what gives perspective texturing),
5069                 // this consists of simply multiplying all arrays by the W coord
5070                 // (which is basically 1/Z), which will be undone per-pixel
5071                 // (multiplying by Z again) to get the perspective-correct array
5072                 // values
5073                 {
5074                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5075                         __m128 mipedgescale, mipdensity;
5076                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5077                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5078                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5079                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5080                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5081                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5082                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5083                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5084                         attribedge1 = _mm_sub_ss(w0, w1);
5085                         attribedge2 = _mm_sub_ss(w2, w1);
5086                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5087                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5088                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5089                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5090                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5091                         _mm_store_ss(&triangle->w[0], attribxslope);
5092                         _mm_store_ss(&triangle->w[1], attribyslope);
5093                         _mm_store_ss(&triangle->w[2], attriborigin);
5094                         
5095                         clip0origin = 0;
5096                         clip0slope = 0;
5097                         clip0dir = 0;
5098                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5099                         {
5100                                 float cliporigin, clipxslope, clipyslope;
5101                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5102                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5103                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5104                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5105                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5106                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5107                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5108                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5109                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5110                                 if(clipxslope != 0)
5111                                 {
5112                                         clip0origin = -cliporigin/clipxslope;
5113                                         clip0slope = -clipyslope/clipxslope;
5114                                         clip0dir = clipxslope > 0 ? 1 : -1;
5115                                 }
5116                                 else if(clipyslope > 0)
5117                                 {
5118                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5119                                         clip0slope = dpsoftrast.fb_width;
5120                                         clip0dir = -1;
5121                                 }
5122                                 else if(clipyslope < 0)
5123                                 {
5124                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5125                                         clip0slope = -dpsoftrast.fb_width;
5126                                         clip0dir = -1;
5127                                 }
5128                                 else if(clip0origin < 0) continue;
5129                         }
5130
5131                         mipedgescale = _mm_setzero_ps();
5132                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5133                         {
5134                                 __m128 attrib0, attrib1, attrib2;
5135                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5136                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5137                                         break;
5138                                 arrays += numvertices*4;
5139                                 GENATTRIBS(attrib0, attrib1, attrib2);
5140                                 attriborigin = _mm_mul_ps(attrib1, w1);
5141                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5142                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5143                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5144                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5145                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5146                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5147                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5148                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5149                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5150                                 {
5151                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5152                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5153                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5154                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5155                                 }
5156                         }
5157
5158                         memset(triangle->mip, 0, sizeof(triangle->mip));
5159                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5160                         {
5161                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5162                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5163                                         break;
5164                                 texture = thread->texbound[texunit];
5165                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5166                                 {
5167                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5168                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5169                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5170                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5171                                         // this will be multiplied in the texturing routine by the texture resolution
5172                                         y = _mm_cvtss_si32(mipdensity);
5173                                         if (y > 0)
5174                                         {
5175                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5176                                                 if (y > texture->mipmaps - 1)
5177                                                         y = texture->mipmaps - 1;
5178                                                 triangle->mip[texunit] = y;
5179                                         }
5180                                 }
5181                         }
5182                 }
5183         
5184                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5185                 for (; y < bandy;)
5186                 {
5187                         __m128 xcoords, xslope;
5188                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5189                         int yccmask = _mm_movemask_epi8(ycc);
5190                         int edge0p, edge0n, edge1p, edge1n;
5191                         int nexty;
5192                         float w, wslope;
5193                         float clip0;
5194                         if (numpoints == 4)
5195                         {
5196                                 switch(yccmask)
5197                                 {
5198                                 default:
5199                                 case 0xFFFF: /*0000*/ y = endy; continue;
5200                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5201                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5202                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5203                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5204                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5205                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5206                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5207                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5208                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5209                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5210                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5211                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5212                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5213                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5214                                 case 0x0000: /*1111*/ y++; continue;
5215                                 }
5216                         }
5217                         else
5218                         {
5219                                 switch(yccmask)
5220                                 {
5221                                 default:
5222                                 case 0xFFFF: /*000*/ y = endy; continue;
5223                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5224                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5225                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5226                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5227                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5228                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5229                                 case 0x0000: /*111*/ y++; continue;
5230                                 }
5231                         }
5232                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5233                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5234                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5235                         nexty = _mm_extract_epi16(ycc, 0);
5236                         if (nexty >= bandy) nexty = bandy-1;
5237                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5238                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5239                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5240                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5241                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5242                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5243                         {
5244                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5245                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5246                         }
5247                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5248                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5249                         {
5250                                 int startx, endx, offset;
5251                                 startx = _mm_cvtss_si32(xcoords);
5252                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5253                                 if (startx < minx) startx = minx;
5254                                 if (endx > maxx) endx = maxx;
5255                                 if (startx >= endx) continue;
5256
5257                                 if (clip0dir)
5258                                 {
5259                                         if (clip0dir > 0)
5260                                         {
5261                                                 if (startx < clip0) 
5262                                                 {
5263                                                         if(endx <= clip0) continue;
5264                                                         startx = (int)clip0;
5265                                                 }
5266                                         }
5267                                         else if (endx > clip0) 
5268                                         {
5269                                                 if(startx >= clip0) continue;
5270                                                 endx = (int)clip0;
5271                                         }
5272                                 }
5273                                                 
5274                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5275                                 {
5276                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5277                                         span->triangle = thread->numtriangles;
5278                                         span->x = offset;
5279                                         span->y = y;
5280                                         span->startx = 0;
5281                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5282                                         if (span->startx >= span->endx)
5283                                                 continue;
5284                                         wslope = triangle->w[0];
5285                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5286                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5287                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5288                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5289                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5290                                 }
5291                         }
5292                 }
5293
5294                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5295                 {
5296                         DPSOFTRAST_Draw_ProcessSpans(thread);
5297                         thread->numtriangles = 0;
5298                 }
5299         }
5300
5301         if (!ATOMIC_DECREMENT(command->refcount))
5302         {
5303                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5304                         MM_FREE(command->arrays);
5305         }
5306
5307         if (thread->numspans > 0 || thread->numtriangles > 0)
5308         {
5309                 DPSOFTRAST_Draw_ProcessSpans(thread);
5310                 thread->numtriangles = 0;
5311         }
5312 #endif
5313 }
5314
5315 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5316 {
5317         int i;
5318         int j;
5319         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5320         int datasize = 2*numvertices*sizeof(float[4]);
5321         DPSOFTRAST_Command_Draw *command;
5322         unsigned char *data;
5323         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5324         {
5325                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5326                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5327                         break;
5328                 datasize += numvertices*sizeof(float[4]);
5329         }
5330         if (element3s)
5331                 datasize += numtriangles*sizeof(unsigned short[3]);
5332         else if (element3i)
5333                 datasize += numtriangles*sizeof(int[3]);
5334         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5335         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5336         {
5337                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5338                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5339         }
5340         else
5341         {
5342                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5343                 data = (unsigned char *)command + commandsize;
5344         }
5345         command->firstvertex = firstvertex;
5346         command->numvertices = numvertices;
5347         command->numtriangles = numtriangles;
5348         command->arrays = (float *)data;
5349         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5350         dpsoftrast.firstvertex = firstvertex;
5351         dpsoftrast.numvertices = numvertices;
5352         dpsoftrast.screencoord4f = (float *)data;
5353         data += numvertices*sizeof(float[4]);
5354         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5355         data += numvertices*sizeof(float[4]);
5356         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5357         {
5358                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5359                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5360                         break;
5361                 dpsoftrast.post_array4f[j] = (float *)data;
5362                 data += numvertices*sizeof(float[4]);
5363         }
5364         command->element3i = NULL;
5365         command->element3s = NULL;
5366         if (element3s)
5367         {
5368                 command->element3s = (unsigned short *)data;
5369                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5370         }
5371         else if (element3i)
5372         {
5373                 command->element3i = (int *)data;
5374                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5375         }
5376         return command;
5377 }
5378
5379 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5380 {
5381         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5382         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5383         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5384         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5385         if (command->starty >= command->endy)
5386         {
5387                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5388                         MM_FREE(command->arrays);
5389                 DPSOFTRAST_UndoCommand(command->commandsize);
5390                 return;
5391         }
5392         command->clipped = dpsoftrast.drawclipped;
5393         command->refcount = dpsoftrast.numthreads;
5394
5395         if (dpsoftrast.usethreads)
5396         {
5397                 int i;
5398                 DPSOFTRAST_Draw_SyncCommands();
5399                 for (i = 0; i < dpsoftrast.numthreads; i++)
5400                 {
5401                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5402                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5403                                 Thread_CondSignal(thread->drawcond);
5404                 }
5405         }
5406         else
5407         {
5408                 DPSOFTRAST_Draw_FlushThreads();
5409         }
5410 }
5411
5412 DEFCOMMAND(23, SetRenderTargets, int width; int height;);
5413 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5414 {
5415         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5416 }
5417 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5418 {
5419         DPSOFTRAST_Command_SetRenderTargets *command;
5420         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5421                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5422                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5423                 DPSOFTRAST_Flush();
5424         dpsoftrast.fb_width = width;
5425         dpsoftrast.fb_height = height;
5426         dpsoftrast.fb_depthpixels = depthpixels;
5427         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5428         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5429         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5430         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5431         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5432         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5433         command->width = width;
5434         command->height = height;
5435 }
5436  
5437 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5438 {
5439         int commandoffset = thread->commandoffset;
5440         while (commandoffset != endoffset)
5441         {
5442                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5443                 switch (command->opcode)
5444                 {
5445 #define INTERPCOMMAND(name) \
5446                 case DPSOFTRAST_OPCODE_##name : \
5447                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5448                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5449                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5450                                 commandoffset = 0; \
5451                         break;
5452                 INTERPCOMMAND(Viewport)
5453                 INTERPCOMMAND(ClearColor)
5454                 INTERPCOMMAND(ClearDepth)
5455                 INTERPCOMMAND(ColorMask)
5456                 INTERPCOMMAND(DepthTest)
5457                 INTERPCOMMAND(ScissorTest)
5458                 INTERPCOMMAND(Scissor)
5459                 INTERPCOMMAND(BlendFunc)
5460                 INTERPCOMMAND(BlendSubtract)
5461                 INTERPCOMMAND(DepthMask)
5462                 INTERPCOMMAND(DepthFunc)
5463                 INTERPCOMMAND(DepthRange)
5464                 INTERPCOMMAND(PolygonOffset)
5465                 INTERPCOMMAND(CullFace)
5466                 INTERPCOMMAND(SetTexture)
5467                 INTERPCOMMAND(SetShader)
5468                 INTERPCOMMAND(Uniform4f)
5469                 INTERPCOMMAND(UniformMatrix4f)
5470                 INTERPCOMMAND(Uniform1i)
5471                 INTERPCOMMAND(SetRenderTargets)
5472                 INTERPCOMMAND(ClipPlane)
5473
5474                 case DPSOFTRAST_OPCODE_Draw:
5475                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5476                         commandoffset += command->commandsize;
5477                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5478                                 commandoffset = 0;
5479                         thread->commandoffset = commandoffset;
5480                         break;
5481
5482                 case DPSOFTRAST_OPCODE_Reset:
5483                         commandoffset = 0;
5484                         break;
5485                 }
5486         }
5487         thread->commandoffset = commandoffset;
5488 }
5489
5490 static int DPSOFTRAST_Draw_Thread(void *data)
5491 {
5492         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5493         while(thread->index >= 0)
5494         {
5495                 if (thread->commandoffset != dpsoftrast.drawcommand)
5496                 {
5497                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5498                 }
5499                 else 
5500                 {
5501                         Thread_LockMutex(thread->drawmutex);
5502                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5503                         {
5504                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5505                                 thread->starving = true;
5506                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5507                                 thread->starving = false;
5508                         }
5509                         Thread_UnlockMutex(thread->drawmutex);
5510                 }
5511         }   
5512         return 0;
5513 }
5514
5515 static void DPSOFTRAST_Draw_FlushThreads(void)
5516 {
5517         DPSOFTRAST_State_Thread *thread;
5518         int i;
5519         DPSOFTRAST_Draw_SyncCommands();
5520         if (dpsoftrast.usethreads) 
5521         {
5522                 for (i = 0; i < dpsoftrast.numthreads; i++)
5523                 {
5524                         thread = &dpsoftrast.threads[i];
5525                         if (thread->commandoffset != dpsoftrast.drawcommand)
5526                         {
5527                                 Thread_LockMutex(thread->drawmutex);
5528                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5529                                         Thread_CondSignal(thread->drawcond);
5530                                 Thread_UnlockMutex(thread->drawmutex);
5531                         }
5532                 }
5533                 for (i = 0; i < dpsoftrast.numthreads; i++)
5534                 {
5535                         thread = &dpsoftrast.threads[i];
5536                         if (thread->commandoffset != dpsoftrast.drawcommand)
5537                         {
5538                                 Thread_LockMutex(thread->drawmutex);
5539                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5540                                 {
5541                                         thread->waiting = true;
5542                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5543                                         thread->waiting = false;
5544                                 }
5545                                 Thread_UnlockMutex(thread->drawmutex);
5546                         }
5547                 }
5548         }
5549         else
5550         {
5551                 for (i = 0; i < dpsoftrast.numthreads; i++)
5552                 {
5553                         thread = &dpsoftrast.threads[i];
5554                         if (thread->commandoffset != dpsoftrast.drawcommand)
5555                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5556                 }
5557         }
5558         dpsoftrast.commandpool.usedcommands = 0;
5559 }
5560
// Public flush entry point: forwards to DPSOFTRAST_Draw_FlushThreads().
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5565
// Public finish entry point: currently identical to DPSOFTRAST_Flush().
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Flush();
}
5570
5571 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5572 {
5573         int i;
5574         union
5575         {
5576                 int i;
5577                 unsigned char b[4];
5578         }
5579         u;
5580         u.i = 1;
5581         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5582         dpsoftrast.bigendian = u.b[3];
5583         dpsoftrast.fb_width = width;
5584         dpsoftrast.fb_height = height;
5585         dpsoftrast.fb_depthpixels = depthpixels;
5586         dpsoftrast.fb_colorpixels[0] = colorpixels;
5587         dpsoftrast.fb_colorpixels[1] = NULL;
5588         dpsoftrast.fb_colorpixels[1] = NULL;
5589         dpsoftrast.fb_colorpixels[1] = NULL;
5590         dpsoftrast.viewport[0] = 0;
5591         dpsoftrast.viewport[1] = 0;
5592         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5593         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5594         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5595         dpsoftrast.texture_firstfree = 1;
5596         dpsoftrast.texture_end = 1;
5597         dpsoftrast.texture_max = 0;
5598         dpsoftrast.color[0] = 1;
5599         dpsoftrast.color[1] = 1;
5600         dpsoftrast.color[2] = 1;
5601         dpsoftrast.color[3] = 1;
5602         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5603         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5604         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5605         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5606         for (i = 0; i < dpsoftrast.numthreads; i++)
5607         {
5608                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5609                 thread->index = i;
5610                 thread->cullface = GL_BACK;
5611         thread->colormask[0] = 1; 
5612                 thread->colormask[1] = 1;
5613                 thread->colormask[2] = 1;
5614                 thread->colormask[3] = 1;
5615                 thread->blendfunc[0] = GL_ONE;
5616                 thread->blendfunc[1] = GL_ZERO;
5617                 thread->depthmask = true;
5618                 thread->depthtest = true;
5619                 thread->depthfunc = GL_LEQUAL;
5620                 thread->scissortest = false;
5621                 thread->viewport[0] = 0;
5622                 thread->viewport[1] = 0;
5623                 thread->viewport[2] = dpsoftrast.fb_width;
5624                 thread->viewport[3] = dpsoftrast.fb_height;
5625                 thread->scissor[0] = 0;
5626                 thread->scissor[1] = 0;
5627                 thread->scissor[2] = dpsoftrast.fb_width;
5628                 thread->scissor[3] = dpsoftrast.fb_height;
5629                 thread->depthrange[0] = 0;
5630                 thread->depthrange[1] = 1;
5631                 thread->polygonoffset[0] = 0;
5632                 thread->polygonoffset[1] = 0;
5633                 thread->clipplane[0] = 0;
5634                 thread->clipplane[1] = 0;
5635                 thread->clipplane[2] = 0;
5636                 thread->clipplane[3] = 1;
5637         
5638                 thread->numspans = 0;
5639                 thread->numtriangles = 0;
5640                 thread->commandoffset = 0;
5641                 thread->waiting = false;
5642                 thread->starving = false;
5643            
5644                 thread->validate = -1;
5645                 DPSOFTRAST_Validate(thread, -1);
5646  
5647                 if (dpsoftrast.usethreads)
5648                 {
5649                         thread->waitcond = Thread_CreateCond();
5650                         thread->drawcond = Thread_CreateCond();
5651                         thread->drawmutex = Thread_CreateMutex();
5652                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5653                 }
5654         }
5655         return 0;
5656 }
5657
5658 void DPSOFTRAST_Shutdown(void)
5659 {
5660         int i;
5661         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5662         {
5663                 DPSOFTRAST_State_Thread *thread;
5664                 for (i = 0; i < dpsoftrast.numthreads; i++)
5665                 {
5666                         thread = &dpsoftrast.threads[i];
5667                         Thread_LockMutex(thread->drawmutex);
5668                         thread->index = -1;
5669                         Thread_CondSignal(thread->drawcond);
5670                         Thread_UnlockMutex(thread->drawmutex);
5671                         Thread_WaitThread(thread->thread, 0);
5672                         Thread_DestroyCond(thread->waitcond);
5673                         Thread_DestroyCond(thread->drawcond);
5674                         Thread_DestroyMutex(thread->drawmutex);
5675                 }
5676         }
5677         for (i = 0;i < dpsoftrast.texture_end;i++)
5678                 if (dpsoftrast.texture[i].bytes)
5679                         MM_FREE(dpsoftrast.texture[i].bytes);
5680         if (dpsoftrast.texture)
5681                 free(dpsoftrast.texture);
5682         if (dpsoftrast.threads)
5683                 MM_FREE(dpsoftrast.threads);
5684         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5685 }
5686