]> de.git.xonotic.org Git - xonotic/darkplaces.git/blob - dpsoftrast.c
Add THREADDISABLE define to make builds with whole threading system disabled.
[xonotic/darkplaces.git] / dpsoftrast.c
1 #include <stdio.h>
2 #include <string.h>
3 #define _USE_MATH_DEFINES
4 #include <math.h>
5 #include "quakedef.h"
6 #include "thread.h"
7 #include "dpsoftrast.h"
8
9 #ifdef _MSC_VER
10 #pragma warning(disable : 4324)
11 #endif
12
// when this file is compiled as C rather than C++, provide `bool` as an
// alias of qboolean
#ifndef __cplusplus
typedef qboolean bool;
#endif
16
// byte alignments requested by the ALIGN() and ATOMIC() wrappers below
#define ALIGN_SIZE 16
#define ATOMIC_SIZE 4

// Platform-specific primitives, only selected when SSE_POSSIBLE is defined:
//   ALIGN(var)               - declare var with 16-byte alignment
//   ATOMIC(var)              - declare var with 4-byte alignment
//   MEMORY_BARRIER           - store fence (_mm_sfence)
//   ATOMIC_COUNTER           - integer type used for atomic counters
//   ATOMIC_INCREMENT/DECREMENT/ADD - atomic read-modify-write on a counter
// Any macro left undefined here receives a plain fallback further below.
#ifdef SSE_POSSIBLE
        #if defined(__APPLE__)
                // Apple: OSAtomic barrier variants
                #include <libkern/OSAtomic.h>
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(4)))
                #define MEMORY_BARRIER (_mm_sfence())
                #define ATOMIC_COUNTER volatile int32_t
                #define ATOMIC_INCREMENT(counter) (OSAtomicIncrement32Barrier(&(counter)))
                #define ATOMIC_DECREMENT(counter) (OSAtomicDecrement32Barrier(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)OSAtomicAdd32Barrier((val), &(counter)))
        #elif defined(__GNUC__) && defined(WIN32)
                // GCC targeting Windows (mingw): Win32 Interlocked* functions
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(4)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile LONG
                // this LONG * cast serves to fix an issue with broken mingw
                // packages on Ubuntu; these only declare the function to take
                // a LONG *, causing a compile error here. This seems to be
                // error- and warn-free on platforms that DO declare
                // InterlockedIncrement correctly, like mingw on Windows.
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement((LONG *) &(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement((LONG *) &(counter)))
                #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd((LONG *) &(counter), (val)))
        #elif defined(__GNUC__)
                // other GCC-compatible compilers: __sync builtins
                #define ALIGN(var) var __attribute__((__aligned__(16)))
                #define ATOMIC(var) var __attribute__((__aligned__(4)))
                #define MEMORY_BARRIER (_mm_sfence())
                //(__sync_synchronize())
                #define ATOMIC_COUNTER volatile int
                #define ATOMIC_INCREMENT(counter) (__sync_add_and_fetch(&(counter), 1))
                #define ATOMIC_DECREMENT(counter) (__sync_add_and_fetch(&(counter), -1))
                #define ATOMIC_ADD(counter, val) ((void)__sync_fetch_and_add(&(counter), (val)))
        #elif defined(_MSC_VER)
                // MSVC: __declspec alignment and Win32 Interlocked* functions
                #define ALIGN(var) __declspec(align(16)) var
                #define ATOMIC(var) __declspec(align(4)) var
                #define MEMORY_BARRIER (_mm_sfence())
                //(MemoryBarrier())
                #define ATOMIC_COUNTER volatile LONG
                #define ATOMIC_INCREMENT(counter) (InterlockedIncrement(&(counter)))
                #define ATOMIC_DECREMENT(counter) (InterlockedDecrement(&(counter)))
                #define ATOMIC_ADD(counter, val) ((void)InterlockedExchangeAdd(&(counter), (val)))
        #endif
#endif

// Fallbacks: no extra alignment, no barrier, and ordinary (non-atomic)
// integer operations on a plain int counter.
#ifndef ALIGN
#define ALIGN(var) var
#endif
#ifndef ATOMIC
#define ATOMIC(var) var
#endif
#ifndef MEMORY_BARRIER
#define MEMORY_BARRIER ((void)0)
#endif
#ifndef ATOMIC_COUNTER
#define ATOMIC_COUNTER int
#endif
#ifndef ATOMIC_INCREMENT
#define ATOMIC_INCREMENT(counter) (++(counter))
#endif
#ifndef ATOMIC_DECREMENT
#define ATOMIC_DECREMENT(counter) (--(counter))
#endif
#ifndef ATOMIC_ADD
#define ATOMIC_ADD(counter, val) ((void)((counter) += (val)))
#endif
86
// Aligned allocation helpers.
// With SSE_POSSIBLE the MM_* helpers hand out ALIGN_SIZE-aligned memory via
// _mm_malloc/_mm_free; otherwise they map onto the C library allocator.
#ifdef SSE_POSSIBLE
#include <emmintrin.h>

// Replacement for _mm_cvtss_f32 on GCC releases older than 4.6 (presumably
// those lack the intrinsic - TODO confirm).  The test compares the major
// version first so that newer majors with a small minor number (5.1, 10.2,
// ...) are not mistaken for old compilers.
#if defined(__GNUC__) && (__GNUC__ < 4 || (__GNUC__ == 4 && __GNUC_MINOR__ < 6)) && !defined(__clang__)
        #define _mm_cvtss_f32(val) (__builtin_ia32_vec_ext_v4sf ((__v4sf)(val), 0))
#endif

#define MM_MALLOC(size) _mm_malloc((size), ALIGN_SIZE)

// Allocates zero-filled, ALIGN_SIZE-aligned storage for nmemb elements of
// size bytes each.  Returns NULL when the allocation fails or when
// nmemb*size does not fit in size_t.  Release the result with MM_FREE.
static void *MM_CALLOC(size_t nmemb, size_t size)
{
        size_t total;
        void *ptr;
        // refuse requests whose byte count would wrap around
        if (size != 0 && nmemb > ((size_t)-1) / size)
                return NULL;
        total = nmemb * size;
        ptr = _mm_malloc(total, ALIGN_SIZE);
        if (ptr != NULL)
                memset(ptr, 0, total);
        return ptr;
}

#define MM_FREE _mm_free
#else
#define MM_MALLOC(size) malloc(size)
#define MM_CALLOC(nmemb, size) calloc(nmemb, size)
#define MM_FREE free
#endif
109
// Indices of the per-vertex attribute arrays (position, color and eight
// texcoord sets).  DPSOFTRAST_ARRAY_TOTAL is the number of arrays and is
// used to size attribute tables elsewhere in this file.
typedef enum DPSOFTRAST_ARRAY_e
{
        DPSOFTRAST_ARRAY_POSITION,
        DPSOFTRAST_ARRAY_COLOR,
        DPSOFTRAST_ARRAY_TEXCOORD0,
        DPSOFTRAST_ARRAY_TEXCOORD1,
        DPSOFTRAST_ARRAY_TEXCOORD2,
        DPSOFTRAST_ARRAY_TEXCOORD3,
        DPSOFTRAST_ARRAY_TEXCOORD4,
        DPSOFTRAST_ARRAY_TEXCOORD5,
        DPSOFTRAST_ARRAY_TEXCOORD6,
        DPSOFTRAST_ARRAY_TEXCOORD7,
        DPSOFTRAST_ARRAY_TOTAL
}
DPSOFTRAST_ARRAY;
125
// One slot of the texture table.  A slot counts as in use while `bytes` is
// non-NULL (see DPSOFTRAST_Texture_GetByIndex).
typedef struct DPSOFTRAST_Texture_s
{
        int flags; // DPSOFTRAST_TEXTURE_FLAG_* / format bits given at creation
        int width;
        int height;
        int depth;
        int sides; // 6 for cubemaps, otherwise 1
        DPSOFTRAST_TEXTURE_FILTER filter;
        int mipmaps; // number of valid entries in mipmap[]
        int size; // total size in bytes of all mip levels
        ATOMIC_COUNTER binds; // non-zero makes DPSOFTRAST_Texture_Free flush before freeing
        unsigned char *bytes; // pixel storage for all mip levels
        // per mip level: [0] byte offset into bytes, [1] size in bytes,
        // [2] width, [3] height, [4] depth
        int mipmap[DPSOFTRAST_MAXMIPMAPS][5];
}
DPSOFTRAST_Texture;
141
// commands share the alignment used by ALIGN()
#define COMMAND_SIZE ALIGN_SIZE
#define COMMAND_ALIGN(var) ALIGN(var)

// common header that every command struct starts with
typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_s
{
        unsigned char opcode; // one of the DPSOFTRAST_OPCODE_* values
        unsigned short commandsize;
}
DPSOFTRAST_Command);

enum { DPSOFTRAST_OPCODE_Reset = 0 };

// DEFCOMMAND(opcodeval, name, fields) declares the constant
// DPSOFTRAST_OPCODE_<name> with value opcodeval and the aligned struct
// DPSOFTRAST_Command_<name>, which repeats the two header members of
// DPSOFTRAST_Command and then appends `fields`.
#define DEFCOMMAND(opcodeval, name, fields) \
        enum { DPSOFTRAST_OPCODE_##name = opcodeval }; \
        typedef COMMAND_ALIGN(struct DPSOFTRAST_Command_##name##_s \
        { \
                unsigned char opcode; \
                unsigned short commandsize; \
                fields \
        } DPSOFTRAST_Command_##name );
162
// size in bytes of the command buffer below
#define DPSOFTRAST_DRAW_MAXCOMMANDPOOL 2097152
// presumably the upper bound for a single command's size - confirm at the use sites
#define DPSOFTRAST_DRAW_MAXCOMMANDSIZE 16384

// Fixed-size byte buffer that holds queued commands.
typedef ALIGN(struct DPSOFTRAST_State_Command_Pool_s
{
        int freecommand; // NOTE(review): looks like the offset of the next free byte - confirm
        int usedcommands; // NOTE(review): looks like the number of bytes in use - confirm
        ALIGN(unsigned char commands[DPSOFTRAST_DRAW_MAXCOMMANDPOOL]);
}
DPSOFTRAST_State_Command_Pool);
173
// Per-triangle interpolation data consumed by the DPSOFTRAST_CALCATTRIB macros.
typedef ALIGN(struct DPSOFTRAST_State_Triangle_s
{
        unsigned char mip[DPSOFTRAST_MAXTEXTUREUNITS]; // texcoord to screen space density values (for picking mipmap of textures)
        float w[3];
        // per attribute array, three 4-float vectors: [0] is multiplied by
        // span x, [1] by span y, [2] is the constant term
        ALIGN(float attribs[DPSOFTRAST_ARRAY_TOTAL][3][4]);
}
DPSOFTRAST_State_Triangle);
181
// Evaluate one interpolated attribute at the start of a span.
//   slope receives attribs[arrayindex][0], the per-pixel step along x
//   data  receives attribs[arrayindex][2] + span->x * attribs[arrayindex][0]
//                                         + span->y * attribs[arrayindex][1]
// DPSOFTRAST_CALCATTRIB is the SSE form (data and slope are __m128 lvalues);
// DPSOFTRAST_CALCATTRIB4F is the scalar form (data and slope are float[4]).
// Both expand to a brace block, and their arguments are evaluated several
// times, so pass expressions without side effects.
#define DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex) { \
        slope = _mm_load_ps((triangle)->attribs[arrayindex][0]); \
        data = _mm_add_ps(_mm_load_ps((triangle)->attribs[arrayindex][2]), \
                                        _mm_add_ps(_mm_mul_ps(_mm_set1_ps((span)->x), slope), \
                                                                _mm_mul_ps(_mm_set1_ps((span)->y), _mm_load_ps((triangle)->attribs[arrayindex][1])))); \
}
#define DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex) { \
        (slope)[0] = (triangle)->attribs[arrayindex][0][0]; \
        (slope)[1] = (triangle)->attribs[arrayindex][0][1]; \
        (slope)[2] = (triangle)->attribs[arrayindex][0][2]; \
        (slope)[3] = (triangle)->attribs[arrayindex][0][3]; \
        (data)[0] = (triangle)->attribs[arrayindex][2][0] + ((span)->x)*(slope)[0] + ((span)->y)*(triangle)->attribs[arrayindex][1][0]; \
        (data)[1] = (triangle)->attribs[arrayindex][2][1] + ((span)->x)*(slope)[1] + ((span)->y)*(triangle)->attribs[arrayindex][1][1]; \
        (data)[2] = (triangle)->attribs[arrayindex][2][2] + ((span)->x)*(slope)[2] + ((span)->y)*(triangle)->attribs[arrayindex][1][2]; \
        (data)[3] = (triangle)->attribs[arrayindex][2][3] + ((span)->x)*(slope)[3] + ((span)->y)*(triangle)->attribs[arrayindex][1][3]; \
}
198                                         
#define DPSOFTRAST_DRAW_MAXSUBSPAN 16

// One horizontal run of pixels produced while rasterizing a triangle.
typedef ALIGN(struct DPSOFTRAST_State_Span_s
{
        int triangle; // triangle this span was generated by
        int x; // framebuffer x coord
        int y; // framebuffer y coord
        int startx; // usable range (according to pixelmask)
        int endx; // usable range (according to pixelmask)
        unsigned char *pixelmask; // true for pixels that passed depth test, false for others
        int depthbase; // depthbuffer value at x (add depthslope*startx to get first pixel's depthbuffer value)
        int depthslope; // depthbuffer value pixel delta
}
DPSOFTRAST_State_Span);
213
// capacities of the per-thread span, triangle and pixel mask arrays
#define DPSOFTRAST_DRAW_MAXSPANS 1024
#define DPSOFTRAST_DRAW_MAXTRIANGLES 128
#define DPSOFTRAST_DRAW_MAXSPANLENGTH 256

// bits of the per-thread `validate` field; a set bit means the matching
// derived values must be recomputed (see DPSOFTRAST_Validate)
#define DPSOFTRAST_VALIDATE_FB 1
#define DPSOFTRAST_VALIDATE_DEPTHFUNC 2
#define DPSOFTRAST_VALIDATE_BLENDFUNC 4
#define DPSOFTRAST_VALIDATE_DRAW (DPSOFTRAST_VALIDATE_FB | DPSOFTRAST_VALIDATE_DEPTHFUNC | DPSOFTRAST_VALIDATE_BLENDFUNC)
222
// Blend modes the rasterizer distinguishes.  DPSOFTRAST_RecalcBlendFunc
// derives one of these from the (source, destination) GL blend factors; the
// pairs listed below are the ones it recognizes.
typedef enum DPSOFTRAST_BLENDMODE_e
{
        DPSOFTRAST_BLENDMODE_OPAQUE, // GL_ONE, GL_ZERO; also used for unrecognized pairs
        DPSOFTRAST_BLENDMODE_ALPHA, // GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA
        DPSOFTRAST_BLENDMODE_ADDALPHA, // GL_SRC_ALPHA, GL_ONE
        DPSOFTRAST_BLENDMODE_ADD, // GL_ONE, GL_ONE
        DPSOFTRAST_BLENDMODE_INVMOD, // GL_ZERO, GL_ONE_MINUS_SRC_COLOR
        DPSOFTRAST_BLENDMODE_MUL, // GL_ZERO, GL_SRC_COLOR or GL_DST_COLOR, GL_ZERO
        DPSOFTRAST_BLENDMODE_MUL2, // GL_DST_COLOR, GL_SRC_COLOR
        DPSOFTRAST_BLENDMODE_SUBALPHA, // GL_SRC_ALPHA, GL_ONE while blendsubtract is set
        DPSOFTRAST_BLENDMODE_PSEUDOALPHA, // GL_ONE, GL_ONE_MINUS_SRC_ALPHA
        DPSOFTRAST_BLENDMODE_INVADD, // GL_ONE_MINUS_DST_COLOR, GL_ONE
        DPSOFTRAST_BLENDMODE_TOTAL
}
DPSOFTRAST_BLENDMODE;
238
// State owned by one entry of dpsoftrast.threads: a copy of the render
// state, values derived from it, the rows this thread covers and its
// span/triangle scratch buffers.
typedef ALIGN(struct DPSOFTRAST_State_Thread_s
{
        void *thread;
        int index; // position in dpsoftrast.threads; selects the row bands in DPSOFTRAST_RecalcThread

        // render state
        int cullface;
        int colormask[4];
        int blendfunc[2]; // [0] source factor, [1] destination factor
        int blendsubtract;
        int depthmask;
        int depthtest;
        int depthfunc;
        int scissortest;
        int viewport[4];
        int scissor[4];
        float depthrange[2];
        float polygonoffset[2];
        float clipplane[4];
        ALIGN(float fb_clipplane[4]); // clipplane rescaled by DPSOFTRAST_RecalcClipPlane

        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];

        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // DPSOFTRAST_VALIDATE_ flags
        int validate;

        // derived values (DPSOFTRAST_VALIDATE_FB)
        int fb_colormask;
        int fb_scissor[4]; // x, y, width, height after clamping to the framebuffer
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        // derived values (DPSOFTRAST_VALIDATE_DEPTHFUNC)
        int fb_depthfunc;

        // derived values (DPSOFTRAST_VALIDATE_BLENDFUNC)
        int fb_blendmode;

        // band boundaries
        int miny1;
        int maxy1;
        int miny2;
        int maxy2;

        ATOMIC(volatile int commandoffset);

        volatile bool waiting;
        volatile bool starving;
        void *waitcond;
        void *drawcond;
        void *drawmutex;

        int numspans;
        int numtriangles;
        DPSOFTRAST_State_Span spans[DPSOFTRAST_DRAW_MAXSPANS];
        DPSOFTRAST_State_Triangle triangles[DPSOFTRAST_DRAW_MAXTRIANGLES];
        unsigned char pixelmaskarray[DPSOFTRAST_DRAW_MAXSPANLENGTH+4]; // LordHavoc: padded to allow some termination bytes
}
DPSOFTRAST_State_Thread);
304
// Global rasterizer state: framebuffer description, vertex array bindings,
// the texture table, the thread array and the shared command pool.
typedef ALIGN(struct DPSOFTRAST_State_s
{
        // framebuffer
        int fb_width;
        int fb_height;
        unsigned int *fb_depthpixels;
        unsigned int *fb_colorpixels[4];

        int viewport[4];
        ALIGN(float fb_viewportcenter[4]);
        ALIGN(float fb_viewportscale[4]);

        float color[4];
        ALIGN(float uniform4f[DPSOFTRAST_UNIFORM_TOTAL*4]);
        int uniform1i[DPSOFTRAST_UNIFORM_TOTAL];

        // vertex array bindings
        const float *pointer_vertex3f;
        const float *pointer_color4f;
        const unsigned char *pointer_color4ub;
        const float *pointer_texcoordf[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int stride_vertex;
        int stride_color;
        int stride_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        int components_texcoord[DPSOFTRAST_MAXTEXCOORDARRAYS];
        DPSOFTRAST_Texture *texbound[DPSOFTRAST_MAXTEXTUREUNITS];

        int firstvertex;
        int numvertices;
        float *post_array4f[DPSOFTRAST_ARRAY_TOTAL];
        float *screencoord4f;
        int drawstarty;
        int drawendy;
        int drawclipped;

        int shader_mode;
        int shader_permutation;
        int shader_exactspecularmath;

        // texture table
        int texture_max; // number of slots allocated in `texture`
        int texture_end; // one past the highest slot in use
        int texture_firstfree; // where DPSOFTRAST_Texture_New starts looking for a free slot
        DPSOFTRAST_Texture *texture;

        int bigendian;

        // error reporting
        const char *errorstring;

        bool usethreads;
        int interlace; // non-zero gives every thread two row bands instead of one
        int numthreads;
        DPSOFTRAST_State_Thread *threads;

        ATOMIC(volatile int drawcommand);

        DPSOFTRAST_State_Command_Pool commandpool;
}
DPSOFTRAST_State);

// the single rasterizer instance used by the functions in this file
DPSOFTRAST_State dpsoftrast;
364
// depth buffer scaling: DPSOFTRAST_DEPTHSCALE is 2^30
#define DPSOFTRAST_DEPTHSCALE (1024.0f*1048576.0f)
#define DPSOFTRAST_DEPTHOFFSET (128.0f)
// Packs four float channels into one integer with blue in bits 0-7, green in
// bits 8-15, red in bits 16-23 and alpha in bits 24-31.  Each channel is
// scaled by 255 and rounded by adding 0.5 before truncation.  Arguments are
// parenthesized so that expressions such as `a + b` convert as a whole.
#define DPSOFTRAST_BGRA8_FROM_RGBA32F(r,g,b,a) (((int)((r) * 255.0f + 0.5f) << 16) | ((int)((g) * 255.0f + 0.5f) << 8) | (int)((b) * 255.0f + 0.5f) | ((int)((a) * 255.0f + 0.5f) << 24))
// Converts a float depth d to the integer depth buffer value
// DPSOFTRAST_DEPTHSCALE * (1 - d).
#define DPSOFTRAST_DEPTH32_FROM_DEPTH32F(d) ((int)(DPSOFTRAST_DEPTHSCALE * (1-(d))))
369
370 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span);
371 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span);
372
373 static void DPSOFTRAST_RecalcViewport(const int *viewport, float *fb_viewportcenter, float *fb_viewportscale)
374 {
375         fb_viewportcenter[1] = viewport[0] + 0.5f * viewport[2] - 0.5f;
376         fb_viewportcenter[2] = dpsoftrast.fb_height - viewport[1] - 0.5f * viewport[3] - 0.5f;
377         fb_viewportcenter[3] = 0.5f;
378         fb_viewportcenter[0] = 0.0f;
379         fb_viewportscale[1] = 0.5f * viewport[2];
380         fb_viewportscale[2] = -0.5f * viewport[3];
381         fb_viewportscale[3] = 0.5f;
382         fb_viewportscale[0] = 1.0f;
383 }
384
385 static void DPSOFTRAST_RecalcThread(DPSOFTRAST_State_Thread *thread)
386 {
387         if (dpsoftrast.interlace)
388         {
389                 thread->miny1 = (thread->index*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
390                 thread->maxy1 = ((thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
391                 thread->miny2 = ((dpsoftrast.numthreads+thread->index)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
392                 thread->maxy2 = ((dpsoftrast.numthreads+thread->index+1)*dpsoftrast.fb_height)/(2*dpsoftrast.numthreads);
393         }
394         else
395         {
396                 thread->miny1 = thread->miny2 = (thread->index*dpsoftrast.fb_height)/dpsoftrast.numthreads;
397                 thread->maxy1 = thread->maxy2 = ((thread->index+1)*dpsoftrast.fb_height)/dpsoftrast.numthreads;
398         }
399 }
400
401 static void DPSOFTRAST_RecalcClipPlane(DPSOFTRAST_State_Thread *thread)
402 {
403         thread->fb_clipplane[0] = thread->clipplane[0] / thread->fb_viewportscale[1];
404         thread->fb_clipplane[1] = thread->clipplane[1] / thread->fb_viewportscale[2];
405         thread->fb_clipplane[2] = thread->clipplane[2] / thread->fb_viewportscale[3];
406         thread->fb_clipplane[3] = thread->clipplane[3] / thread->fb_viewportscale[0];
407         thread->fb_clipplane[3] -= thread->fb_viewportcenter[1]*thread->fb_clipplane[0] + thread->fb_viewportcenter[2]*thread->fb_clipplane[1] + thread->fb_viewportcenter[3]*thread->fb_clipplane[2] + thread->fb_viewportcenter[0]*thread->fb_clipplane[3];
408 }
409
410 static void DPSOFTRAST_RecalcFB(DPSOFTRAST_State_Thread *thread)
411 {
412         // calculate framebuffer scissor, viewport, viewport clipped by scissor,
413         // and viewport projection values
414         int x1, x2;
415         int y1, y2;
416         x1 = thread->scissor[0];
417         x2 = thread->scissor[0] + thread->scissor[2];
418         y1 = dpsoftrast.fb_height - thread->scissor[1] - thread->scissor[3];
419         y2 = dpsoftrast.fb_height - thread->scissor[1];
420         if (!thread->scissortest) {x1 = 0;y1 = 0;x2 = dpsoftrast.fb_width;y2 = dpsoftrast.fb_height;}
421         if (x1 < 0) x1 = 0;
422         if (x2 > dpsoftrast.fb_width) x2 = dpsoftrast.fb_width;
423         if (y1 < 0) y1 = 0;
424         if (y2 > dpsoftrast.fb_height) y2 = dpsoftrast.fb_height;
425         thread->fb_scissor[0] = x1;
426         thread->fb_scissor[1] = y1;
427         thread->fb_scissor[2] = x2 - x1;
428         thread->fb_scissor[3] = y2 - y1;
429
430         DPSOFTRAST_RecalcViewport(thread->viewport, thread->fb_viewportcenter, thread->fb_viewportscale);
431         DPSOFTRAST_RecalcClipPlane(thread);
432         DPSOFTRAST_RecalcThread(thread);
433 }
434
435 static void DPSOFTRAST_RecalcDepthFunc(DPSOFTRAST_State_Thread *thread)
436 {
437         thread->fb_depthfunc = thread->depthtest ? thread->depthfunc : GL_ALWAYS;
438 }
439
440 static void DPSOFTRAST_RecalcBlendFunc(DPSOFTRAST_State_Thread *thread)
441 {
442         if (thread->blendsubtract)
443         {
444                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
445                 {
446                 #define BLENDFUNC(sfactor, dfactor, blendmode) \
447                         case (sfactor<<16)|dfactor: thread->fb_blendmode = blendmode; break;
448                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_SUBALPHA)
449                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
450                 }
451         }
452         else
453         {       
454                 switch ((thread->blendfunc[0]<<16)|thread->blendfunc[1])
455                 {
456                 BLENDFUNC(GL_ONE, GL_ZERO, DPSOFTRAST_BLENDMODE_OPAQUE)
457                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_ALPHA)
458                 BLENDFUNC(GL_SRC_ALPHA, GL_ONE, DPSOFTRAST_BLENDMODE_ADDALPHA)
459                 BLENDFUNC(GL_ONE, GL_ONE, DPSOFTRAST_BLENDMODE_ADD)
460                 BLENDFUNC(GL_ZERO, GL_ONE_MINUS_SRC_COLOR, DPSOFTRAST_BLENDMODE_INVMOD)
461                 BLENDFUNC(GL_ZERO, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL)
462                 BLENDFUNC(GL_DST_COLOR, GL_ZERO, DPSOFTRAST_BLENDMODE_MUL)
463                 BLENDFUNC(GL_DST_COLOR, GL_SRC_COLOR, DPSOFTRAST_BLENDMODE_MUL2)
464                 BLENDFUNC(GL_ONE, GL_ONE_MINUS_SRC_ALPHA, DPSOFTRAST_BLENDMODE_PSEUDOALPHA)
465                 BLENDFUNC(GL_ONE_MINUS_DST_COLOR, GL_ONE, DPSOFTRAST_BLENDMODE_INVADD)
466                 default: thread->fb_blendmode = DPSOFTRAST_BLENDMODE_OPAQUE; break;
467                 }
468         }
469 }
470
// Calls DPSOFTRAST_Validate(thread, f) only when at least one of the flags
// in f is still pending in thread->validate; always evaluates to 0.
// `thread` is evaluated more than once, so pass an expression without side
// effects.
#define DPSOFTRAST_ValidateQuick(thread, f) ((((thread)->validate) & (f)) ? (DPSOFTRAST_Validate((thread), (f)), 0) : 0)
472
473 static void DPSOFTRAST_Validate(DPSOFTRAST_State_Thread *thread, int mask)
474 {
475         mask &= thread->validate;
476         if (!mask)
477                 return;
478         if (mask & DPSOFTRAST_VALIDATE_FB)
479         {
480                 thread->validate &= ~DPSOFTRAST_VALIDATE_FB;
481                 DPSOFTRAST_RecalcFB(thread);
482         }
483         if (mask & DPSOFTRAST_VALIDATE_DEPTHFUNC)
484         {
485                 thread->validate &= ~DPSOFTRAST_VALIDATE_DEPTHFUNC;
486                 DPSOFTRAST_RecalcDepthFunc(thread);
487         }
488         if (mask & DPSOFTRAST_VALIDATE_BLENDFUNC)
489         {
490                 thread->validate &= ~DPSOFTRAST_VALIDATE_BLENDFUNC;
491                 DPSOFTRAST_RecalcBlendFunc(thread);
492         }
493 }
494
495 static DPSOFTRAST_Texture *DPSOFTRAST_Texture_GetByIndex(int index)
496 {
497         if (index >= 1 && index < dpsoftrast.texture_end && dpsoftrast.texture[index].bytes)
498                 return &dpsoftrast.texture[index];
499         return NULL;
500 }
501
502 static void DPSOFTRAST_Texture_Grow(void)
503 {
504         DPSOFTRAST_Texture *oldtexture = dpsoftrast.texture;
505         DPSOFTRAST_State_Thread *thread;
506         int i;
507         int j;
508         DPSOFTRAST_Flush();
509         // expand texture array as needed
510         if (dpsoftrast.texture_max < 1024)
511                 dpsoftrast.texture_max = 1024;
512         else
513                 dpsoftrast.texture_max *= 2;
514         dpsoftrast.texture = (DPSOFTRAST_Texture *)realloc(dpsoftrast.texture, dpsoftrast.texture_max * sizeof(DPSOFTRAST_Texture));
515         for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
516                 if (dpsoftrast.texbound[i])
517                         dpsoftrast.texbound[i] = dpsoftrast.texture + (dpsoftrast.texbound[i] - oldtexture);
518         for (j = 0; j < dpsoftrast.numthreads; j++)
519         {
520                 thread = &dpsoftrast.threads[j];
521                 for (i = 0; i < DPSOFTRAST_MAXTEXTUREUNITS; i++)
522                         if (thread->texbound[i])
523                                 thread->texbound[i] = dpsoftrast.texture + (thread->texbound[i] - oldtexture);
524         }
525 }
526
527 int DPSOFTRAST_Texture_New(int flags, int width, int height, int depth)
528 {
529         int w;
530         int h;
531         int d;
532         int size;
533         int s;
534         int texnum;
535         int mipmaps;
536         int sides = (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) ? 6 : 1;
537         int texformat = flags & DPSOFTRAST_TEXTURE_FORMAT_COMPAREMASK;
538         DPSOFTRAST_Texture *texture;
539         if (width*height*depth < 1)
540         {
541                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: width, height or depth is less than 1";
542                 return 0;
543         }
544         if (width > DPSOFTRAST_TEXTURE_MAXSIZE || height > DPSOFTRAST_TEXTURE_MAXSIZE || depth > DPSOFTRAST_TEXTURE_MAXSIZE)
545         {
546                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: texture size is too large";
547                 return 0;
548         }
549         switch(texformat)
550         {
551         case DPSOFTRAST_TEXTURE_FORMAT_BGRA8:
552         case DPSOFTRAST_TEXTURE_FORMAT_RGBA8:
553         case DPSOFTRAST_TEXTURE_FORMAT_ALPHA8:
554                 break;
555         case DPSOFTRAST_TEXTURE_FORMAT_DEPTH:
556                 if (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP)
557                 {
558                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
559                         return 0;
560                 }
561                 if (depth != 1)
562                 {
563                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH only permitted on 2D textures";
564                         return 0;
565                 }
566                 if ((flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && (texformat == DPSOFTRAST_TEXTURE_FORMAT_DEPTH))
567                 {
568                         dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FORMAT_DEPTH does not permit mipmaps";
569                         return 0;
570                 }
571                 break;
572         }
573         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP))
574         {
575                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_CUBEMAP can not be used on 3D textures";
576                 return 0;
577         }
578         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
579         {
580                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
581                 return 0;
582         }
583         if (depth != 1 && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
584         {
585                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on 3D textures";
586                 return 0;
587         }
588         if ((flags & DPSOFTRAST_TEXTURE_FLAG_CUBEMAP) && (flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
589         {
590                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: DPSOFTRAST_TEXTURE_FLAG_MIPMAP can not be used on cubemap textures";
591                 return 0;
592         }
593         if ((width & (width-1)) || (height & (height-1)) || (depth & (depth-1)))
594         {
595                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_New: dimensions are not power of two";
596                 return 0;
597         }
598         // find first empty slot in texture array
599         for (texnum = dpsoftrast.texture_firstfree;texnum < dpsoftrast.texture_end;texnum++)
600                 if (!dpsoftrast.texture[texnum].bytes)
601                         break;
602         dpsoftrast.texture_firstfree = texnum + 1;
603         if (dpsoftrast.texture_max <= texnum)
604                 DPSOFTRAST_Texture_Grow();
605         if (dpsoftrast.texture_end <= texnum)
606                 dpsoftrast.texture_end = texnum + 1;
607         texture = &dpsoftrast.texture[texnum];
608         memset(texture, 0, sizeof(*texture));
609         texture->flags = flags;
610         texture->width = width;
611         texture->height = height;
612         texture->depth = depth;
613         texture->sides = sides;
614         texture->binds = 0;
615         w = width;
616         h = height;
617         d = depth;
618         size = 0;
619         mipmaps = 0;
620         w = width;
621         h = height;
622         d = depth;
623         for (;;)
624         {
625                 s = w * h * d * sides * 4;
626                 texture->mipmap[mipmaps][0] = size;
627                 texture->mipmap[mipmaps][1] = s;
628                 texture->mipmap[mipmaps][2] = w;
629                 texture->mipmap[mipmaps][3] = h;
630                 texture->mipmap[mipmaps][4] = d;
631                 size += s;
632                 mipmaps++;
633                 if (w * h * d == 1 || !(flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP))
634                         break;
635                 if (w > 1) w >>= 1;
636                 if (h > 1) h >>= 1;
637                 if (d > 1) d >>= 1;
638         }
639         texture->mipmaps = mipmaps;
640         texture->size = size;
641
642         // allocate the pixels now
643         texture->bytes = (unsigned char *)MM_CALLOC(1, size);
644
645         return texnum;
646 }
647 void DPSOFTRAST_Texture_Free(int index)
648 {
649         DPSOFTRAST_Texture *texture;
650         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
651         if (texture->binds)
652                 DPSOFTRAST_Flush();
653         if (texture->bytes)
654                 MM_FREE(texture->bytes);
655         texture->bytes = NULL;
656         memset(texture, 0, sizeof(*texture));
657         // adjust the free range and used range
658         if (dpsoftrast.texture_firstfree > index)
659                 dpsoftrast.texture_firstfree = index;
660         while (dpsoftrast.texture_end > 0 && dpsoftrast.texture[dpsoftrast.texture_end-1].bytes == NULL)
661                 dpsoftrast.texture_end--;
662 }
663 static void DPSOFTRAST_Texture_CalculateMipmaps(int index)
664 {
665         int i, x, y, z, w, layer0, layer1, row0, row1;
666         unsigned char *o, *i0, *i1, *i2, *i3;
667         DPSOFTRAST_Texture *texture;
668         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
669         if (texture->mipmaps <= 1)
670                 return;
671         for (i = 1;i < texture->mipmaps;i++)
672         {
673                 for (z = 0;z < texture->mipmap[i][4];z++)
674                 {
675                         layer0 = z*2;
676                         layer1 = z*2+1;
677                         if (layer1 >= texture->mipmap[i-1][4])
678                                 layer1 = texture->mipmap[i-1][4]-1;
679                         for (y = 0;y < texture->mipmap[i][3];y++)
680                         {
681                                 row0 = y*2;
682                                 row1 = y*2+1;
683                                 if (row1 >= texture->mipmap[i-1][3])
684                                         row1 = texture->mipmap[i-1][3]-1;
685                                 o =  texture->bytes + texture->mipmap[i  ][0] + 4*((texture->mipmap[i  ][3] * z      + y   ) * texture->mipmap[i  ][2]);
686                                 i0 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row0) * texture->mipmap[i-1][2]);
687                                 i1 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer0 + row1) * texture->mipmap[i-1][2]);
688                                 i2 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row0) * texture->mipmap[i-1][2]);
689                                 i3 = texture->bytes + texture->mipmap[i-1][0] + 4*((texture->mipmap[i-1][3] * layer1 + row1) * texture->mipmap[i-1][2]);
690                                 w = texture->mipmap[i][2];
691                                 if (layer1 > layer0)
692                                 {
693                                         if (texture->mipmap[i-1][2] > 1)
694                                         {
695                                                 // average 3D texture
696                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8, i2 += 8, i3 += 8)
697                                                 {
698                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + i2[0] + i2[4] + i3[0] + i3[4] + 4) >> 3;
699                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + i2[1] + i2[5] + i3[1] + i3[5] + 4) >> 3;
700                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + i2[2] + i2[6] + i3[2] + i3[6] + 4) >> 3;
701                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + i2[3] + i2[7] + i3[3] + i3[7] + 4) >> 3;
702                                                 }
703                                         }
704                                         else
705                                         {
706                                                 // average 3D mipmap with parent width == 1
707                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
708                                                 {
709                                                         o[0] = (i0[0] + i1[0] + i2[0] + i3[0] + 2) >> 2;
710                                                         o[1] = (i0[1] + i1[1] + i2[1] + i3[1] + 2) >> 2;
711                                                         o[2] = (i0[2] + i1[2] + i2[2] + i3[2] + 2) >> 2;
712                                                         o[3] = (i0[3] + i1[3] + i2[3] + i3[3] + 2) >> 2;
713                                                 }
714                                         }
715                                 }
716                                 else
717                                 {
718                                         if (texture->mipmap[i-1][2] > 1)
719                                         {
720                                                 // average 2D texture (common case)
721                                                 for (x = 0;x < w;x++, o += 4, i0 += 8, i1 += 8)
722                                                 {
723                                                         o[0] = (i0[0] + i0[4] + i1[0] + i1[4] + 2) >> 2;
724                                                         o[1] = (i0[1] + i0[5] + i1[1] + i1[5] + 2) >> 2;
725                                                         o[2] = (i0[2] + i0[6] + i1[2] + i1[6] + 2) >> 2;
726                                                         o[3] = (i0[3] + i0[7] + i1[3] + i1[7] + 2) >> 2;
727                                                 }
728                                         }
729                                         else
730                                         {
731                                                 // 2D texture with parent width == 1
732                                                 o[0] = (i0[0] + i1[0] + 1) >> 1;
733                                                 o[1] = (i0[1] + i1[1] + 1) >> 1;
734                                                 o[2] = (i0[2] + i1[2] + 1) >> 1;
735                                                 o[3] = (i0[3] + i1[3] + 1) >> 1;
736                                         }
737                                 }
738                         }
739                 }
740         }
741 }
742 void DPSOFTRAST_Texture_UpdatePartial(int index, int mip, const unsigned char *pixels, int blockx, int blocky, int blockwidth, int blockheight)
743 {
744         DPSOFTRAST_Texture *texture;
745         unsigned char *dst;
746         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
747         if (texture->binds)
748                 DPSOFTRAST_Flush();
749         if (pixels)
750         {
751                 dst = texture->bytes + (blocky * texture->mipmap[0][2] + blockx) * 4;
752                 while (blockheight > 0)
753                 {
754                         memcpy(dst, pixels, blockwidth * 4);
755                         pixels += blockwidth * 4;
756                         dst += texture->mipmap[0][2] * 4;
757                         blockheight--;
758                 }
759         }
760         DPSOFTRAST_Texture_CalculateMipmaps(index);
761 }
762 void DPSOFTRAST_Texture_UpdateFull(int index, const unsigned char *pixels)
763 {
764         DPSOFTRAST_Texture *texture;
765         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
766         if (texture->binds)
767                 DPSOFTRAST_Flush();
768         if (pixels)
769                 memcpy(texture->bytes, pixels, texture->mipmap[0][1]);
770         DPSOFTRAST_Texture_CalculateMipmaps(index);
771 }
772 int DPSOFTRAST_Texture_GetWidth(int index, int mip)
773 {
774         DPSOFTRAST_Texture *texture;
775         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
776         return texture->mipmap[mip][2];
777 }
778 int DPSOFTRAST_Texture_GetHeight(int index, int mip)
779 {
780         DPSOFTRAST_Texture *texture;
781         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
782         return texture->mipmap[mip][3];
783 }
784 int DPSOFTRAST_Texture_GetDepth(int index, int mip)
785 {
786         DPSOFTRAST_Texture *texture;
787         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
788         return texture->mipmap[mip][4];
789 }
790 unsigned char *DPSOFTRAST_Texture_GetPixelPointer(int index, int mip)
791 {
792         DPSOFTRAST_Texture *texture;
793         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return 0;
794         if (texture->binds)
795                 DPSOFTRAST_Flush();
796         return texture->bytes + texture->mipmap[mip][0];
797 }
798 void DPSOFTRAST_Texture_Filter(int index, DPSOFTRAST_TEXTURE_FILTER filter)
799 {
800         DPSOFTRAST_Texture *texture;
801         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
802         if (!(texture->flags & DPSOFTRAST_TEXTURE_FLAG_MIPMAP) && filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
803         {
804                 dpsoftrast.errorstring = "DPSOFTRAST_Texture_Filter: requested filter mode requires mipmaps";
805                 return;
806         }
807         if (texture->binds)
808                 DPSOFTRAST_Flush();
809         texture->filter = filter;
810 }
811
812 static void DPSOFTRAST_Draw_FlushThreads(void);
813
814 static void DPSOFTRAST_Draw_SyncCommands(void)
815 {
816         if(dpsoftrast.usethreads) MEMORY_BARRIER;
817         dpsoftrast.drawcommand = dpsoftrast.commandpool.freecommand;
818 }
819
820 static void DPSOFTRAST_Draw_FreeCommandPool(int space)
821 {
822         DPSOFTRAST_State_Thread *thread;
823         int i;
824         int freecommand = dpsoftrast.commandpool.freecommand;
825         int usedcommands = dpsoftrast.commandpool.usedcommands;
826         if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space)
827                 return;
828         DPSOFTRAST_Draw_SyncCommands();
829         for(;;)
830         {
831                 int waitindex = -1;
832                 int commandoffset;
833                 usedcommands = 0;
834                 for (i = 0; i < dpsoftrast.numthreads; i++)
835                 {
836                         thread = &dpsoftrast.threads[i]; 
837                         commandoffset = freecommand - thread->commandoffset;
838                         if (commandoffset < 0)
839                                 commandoffset += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
840                         if (commandoffset > usedcommands)
841                         {
842                                 waitindex = i;
843                                 usedcommands = commandoffset;
844                         }
845                 }
846                 if (usedcommands <= DPSOFTRAST_DRAW_MAXCOMMANDPOOL-space || waitindex < 0)
847                         break;
848                 thread = &dpsoftrast.threads[waitindex];
849                 Thread_LockMutex(thread->drawmutex);
850                 if (thread->commandoffset != dpsoftrast.drawcommand)
851                 {
852                         thread->waiting = true;
853                         if (thread->starving) Thread_CondSignal(thread->drawcond);
854                         Thread_CondWait(thread->waitcond, thread->drawmutex);
855                         thread->waiting = false;
856                 }
857                 Thread_UnlockMutex(thread->drawmutex);
858         }
859         dpsoftrast.commandpool.usedcommands = usedcommands;
860 }
861
// pads nbytes by ((COMMAND_SIZE - (nbytes mod COMMAND_SIZE)) mod COMMAND_SIZE);
// the masking assumes COMMAND_SIZE is a power of two, in which case the result
// is nbytes rounded up to a multiple of COMMAND_SIZE
#define DPSOFTRAST_ALIGNCOMMAND(nbytes) \
	((nbytes) + ((COMMAND_SIZE - ((nbytes)&(COMMAND_SIZE-1))) & (COMMAND_SIZE-1)))
// allocates a command with opcode DPSOFTRAST_OPCODE_<cmdname> and the aligned
// size of DPSOFTRAST_Command_<cmdname>, returning it cast to that struct type
#define DPSOFTRAST_ALLOCATECOMMAND(cmdname) \
	((DPSOFTRAST_Command_##cmdname *) DPSOFTRAST_AllocateCommand( DPSOFTRAST_OPCODE_##cmdname , DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##cmdname ))))
866
867 static void *DPSOFTRAST_AllocateCommand(int opcode, int size)
868 {
869         DPSOFTRAST_Command *command;
870         int freecommand = dpsoftrast.commandpool.freecommand;
871         int usedcommands = dpsoftrast.commandpool.usedcommands;
872         int extra = sizeof(DPSOFTRAST_Command);
873         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
874                 extra += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
875         if (usedcommands > DPSOFTRAST_DRAW_MAXCOMMANDPOOL - (size + extra))
876         {
877                 if (dpsoftrast.usethreads)
878                         DPSOFTRAST_Draw_FreeCommandPool(size + extra);
879                 else
880                         DPSOFTRAST_Draw_FlushThreads();
881                 freecommand = dpsoftrast.commandpool.freecommand;
882                 usedcommands = dpsoftrast.commandpool.usedcommands;
883         }
884         if (DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand < size)
885         {
886                 command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
887                 command->opcode = DPSOFTRAST_OPCODE_Reset;
888                 usedcommands += DPSOFTRAST_DRAW_MAXCOMMANDPOOL - freecommand;
889                 freecommand = 0;
890         }
891         command = (DPSOFTRAST_Command *) &dpsoftrast.commandpool.commands[freecommand];
892         command->opcode = opcode;
893         command->commandsize = size;
894         freecommand += size;
895         if (freecommand >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
896                 freecommand = 0;
897         dpsoftrast.commandpool.freecommand = freecommand;
898         dpsoftrast.commandpool.usedcommands = usedcommands + size;
899         return command;
900 }
901
902 static void DPSOFTRAST_UndoCommand(int size)
903 {
904         int freecommand = dpsoftrast.commandpool.freecommand;
905         int usedcommands = dpsoftrast.commandpool.usedcommands;
906         freecommand -= size;
907         if (freecommand < 0)
908                 freecommand += DPSOFTRAST_DRAW_MAXCOMMANDPOOL;
909         usedcommands -= size;
910         dpsoftrast.commandpool.freecommand = freecommand;
911         dpsoftrast.commandpool.usedcommands = usedcommands;
912 }
913                 
914 DEFCOMMAND(1, Viewport, int x; int y; int width; int height;)
915 static void DPSOFTRAST_Interpret_Viewport(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_Viewport *command)
916 {
917         thread->viewport[0] = command->x;
918         thread->viewport[1] = command->y;
919         thread->viewport[2] = command->width;
920         thread->viewport[3] = command->height;
921         thread->validate |= DPSOFTRAST_VALIDATE_FB;
922 }
923 void DPSOFTRAST_Viewport(int x, int y, int width, int height)
924 {
925         DPSOFTRAST_Command_Viewport *command = DPSOFTRAST_ALLOCATECOMMAND(Viewport);
926         command->x = x;
927         command->y = y;
928         command->width = width;
929         command->height = height;
930
931         dpsoftrast.viewport[0] = x;
932         dpsoftrast.viewport[1] = y;
933         dpsoftrast.viewport[2] = width;
934         dpsoftrast.viewport[3] = height;
935         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
936 }
937
938 DEFCOMMAND(2, ClearColor, float r; float g; float b; float a;) 
939 static void DPSOFTRAST_Interpret_ClearColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_ClearColor *command)
940 {
941         int i, x1, y1, x2, y2, w, h, x, y;
942         int miny1, maxy1, miny2, maxy2;
943         int bandy;
944         unsigned int *p;
945         unsigned int c;
946         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
947         miny1 = thread->miny1;
948         maxy1 = thread->maxy1;
949         miny2 = thread->miny2;
950         maxy2 = thread->maxy2;
951         x1 = thread->fb_scissor[0];
952         y1 = thread->fb_scissor[1];
953         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
954         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
955         if (y1 < miny1) y1 = miny1;
956         if (y2 > maxy2) y2 = maxy2;
957         w = x2 - x1;
958         h = y2 - y1;
959         if (w < 1 || h < 1)
960                 return;
961         // FIXME: honor fb_colormask?
962         c = DPSOFTRAST_BGRA8_FROM_RGBA32F(command->r,command->g,command->b,command->a);
963         for (i = 0;i < 4;i++)
964         {
965                 if (!dpsoftrast.fb_colorpixels[i])
966                         continue;
967                 for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
968                 for (;y < bandy;y++)
969                 {
970                         p = dpsoftrast.fb_colorpixels[i] + y * dpsoftrast.fb_width;
971                         for (x = x1;x < x2;x++)
972                                 p[x] = c;
973                 }
974         }
975 }
976 void DPSOFTRAST_ClearColor(float r, float g, float b, float a)
977 {
978         DPSOFTRAST_Command_ClearColor *command = DPSOFTRAST_ALLOCATECOMMAND(ClearColor);
979         command->r = r;
980         command->g = g;
981         command->b = b;
982         command->a = a;
983 }
984
985 DEFCOMMAND(3, ClearDepth, float depth;)
986 static void DPSOFTRAST_Interpret_ClearDepth(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClearDepth *command)
987 {
988         int x1, y1, x2, y2, w, h, x, y;
989         int miny1, maxy1, miny2, maxy2;
990         int bandy;
991         unsigned int *p;
992         unsigned int c;
993         DPSOFTRAST_Validate(thread, DPSOFTRAST_VALIDATE_FB);
994         miny1 = thread->miny1;
995         maxy1 = thread->maxy1;
996         miny2 = thread->miny2;
997         maxy2 = thread->maxy2;
998         x1 = thread->fb_scissor[0];
999         y1 = thread->fb_scissor[1];
1000         x2 = thread->fb_scissor[0] + thread->fb_scissor[2];
1001         y2 = thread->fb_scissor[1] + thread->fb_scissor[3];
1002         if (y1 < miny1) y1 = miny1;
1003         if (y2 > maxy2) y2 = maxy2;
1004         w = x2 - x1;
1005         h = y2 - y1;
1006         if (w < 1 || h < 1)
1007                 return;
1008         c = DPSOFTRAST_DEPTH32_FROM_DEPTH32F(command->depth);
1009         for (y = y1, bandy = min(y2, maxy1); y < y2; bandy = min(y2, maxy2), y = max(y, miny2))
1010         for (;y < bandy;y++)
1011         {
1012                 p = dpsoftrast.fb_depthpixels + y * dpsoftrast.fb_width;
1013                 for (x = x1;x < x2;x++)
1014                         p[x] = c;
1015         }
1016 }
1017 void DPSOFTRAST_ClearDepth(float d)
1018 {
1019         DPSOFTRAST_Command_ClearDepth *command = DPSOFTRAST_ALLOCATECOMMAND(ClearDepth);
1020         command->depth = d;
1021 }
1022
1023 DEFCOMMAND(4, ColorMask, int r; int g; int b; int a;)
1024 static void DPSOFTRAST_Interpret_ColorMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ColorMask *command)
1025 {
1026         thread->colormask[0] = command->r != 0;
1027         thread->colormask[1] = command->g != 0;
1028         thread->colormask[2] = command->b != 0;
1029         thread->colormask[3] = command->a != 0;
1030         thread->fb_colormask = ((-thread->colormask[0]) & 0x00FF0000) | ((-thread->colormask[1]) & 0x0000FF00) | ((-thread->colormask[2]) & 0x000000FF) | ((-thread->colormask[3]) & 0xFF000000);
1031 }
1032 void DPSOFTRAST_ColorMask(int r, int g, int b, int a)
1033 {
1034         DPSOFTRAST_Command_ColorMask *command = DPSOFTRAST_ALLOCATECOMMAND(ColorMask);
1035         command->r = r;
1036         command->g = g;
1037         command->b = b;
1038         command->a = a;
1039 }
1040
1041 DEFCOMMAND(5, DepthTest, int enable;)
1042 static void DPSOFTRAST_Interpret_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthTest *command)
1043 {
1044         thread->depthtest = command->enable;
1045         thread->validate |= DPSOFTRAST_VALIDATE_DEPTHFUNC;
1046 }
1047 void DPSOFTRAST_DepthTest(int enable)
1048 {
1049         DPSOFTRAST_Command_DepthTest *command = DPSOFTRAST_ALLOCATECOMMAND(DepthTest);
1050         command->enable = enable;
1051 }
1052
1053 DEFCOMMAND(6, ScissorTest, int enable;)
1054 static void DPSOFTRAST_Interpret_ScissorTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ScissorTest *command)
1055 {
1056         thread->scissortest = command->enable;
1057         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1058 }
1059 void DPSOFTRAST_ScissorTest(int enable)
1060 {
1061         DPSOFTRAST_Command_ScissorTest *command = DPSOFTRAST_ALLOCATECOMMAND(ScissorTest);
1062         command->enable = enable;
1063 }
1064
1065 DEFCOMMAND(7, Scissor, float x; float y; float width; float height;)
1066 static void DPSOFTRAST_Interpret_Scissor(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Scissor *command)
1067 {
1068         thread->scissor[0] = command->x;
1069         thread->scissor[1] = command->y;
1070         thread->scissor[2] = command->width;
1071         thread->scissor[3] = command->height;
1072         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1073 }
1074 void DPSOFTRAST_Scissor(float x, float y, float width, float height)
1075 {
1076         DPSOFTRAST_Command_Scissor *command = DPSOFTRAST_ALLOCATECOMMAND(Scissor);
1077         command->x = x;
1078         command->y = y;
1079         command->width = width;
1080         command->height = height;
1081 }
1082
1083 DEFCOMMAND(8, BlendFunc, int sfactor; int dfactor;)
1084 static void DPSOFTRAST_Interpret_BlendFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendFunc *command)
1085 {
1086         thread->blendfunc[0] = command->sfactor;
1087         thread->blendfunc[1] = command->dfactor;
1088         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1089 }
1090 void DPSOFTRAST_BlendFunc(int sfactor, int dfactor)
1091 {
1092         DPSOFTRAST_Command_BlendFunc *command = DPSOFTRAST_ALLOCATECOMMAND(BlendFunc);
1093         command->sfactor = sfactor;
1094         command->dfactor = dfactor;
1095 }
1096
1097 DEFCOMMAND(9, BlendSubtract, int enable;)
1098 static void DPSOFTRAST_Interpret_BlendSubtract(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_BlendSubtract *command)
1099 {
1100         thread->blendsubtract = command->enable;
1101         thread->validate |= DPSOFTRAST_VALIDATE_BLENDFUNC;
1102 }
1103 void DPSOFTRAST_BlendSubtract(int enable)
1104 {
1105         DPSOFTRAST_Command_BlendSubtract *command = DPSOFTRAST_ALLOCATECOMMAND(BlendSubtract);
1106         command->enable = enable;
1107 }
1108
1109 DEFCOMMAND(10, DepthMask, int enable;)
1110 static void DPSOFTRAST_Interpret_DepthMask(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthMask *command)
1111 {
1112         thread->depthmask = command->enable;
1113 }
1114 void DPSOFTRAST_DepthMask(int enable)
1115 {
1116         DPSOFTRAST_Command_DepthMask *command = DPSOFTRAST_ALLOCATECOMMAND(DepthMask);
1117         command->enable = enable;
1118 }
1119
1120 DEFCOMMAND(11, DepthFunc, int func;)
1121 static void DPSOFTRAST_Interpret_DepthFunc(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthFunc *command)
1122 {
1123         thread->depthfunc = command->func;
1124 }
1125 void DPSOFTRAST_DepthFunc(int func)
1126 {
1127         DPSOFTRAST_Command_DepthFunc *command = DPSOFTRAST_ALLOCATECOMMAND(DepthFunc);
1128         command->func = func;
1129 }
1130
1131 DEFCOMMAND(12, DepthRange, float nearval; float farval;)
1132 static void DPSOFTRAST_Interpret_DepthRange(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_DepthRange *command)
1133 {
1134         thread->depthrange[0] = command->nearval;
1135         thread->depthrange[1] = command->farval;
1136 }
1137 void DPSOFTRAST_DepthRange(float nearval, float farval)
1138 {
1139         DPSOFTRAST_Command_DepthRange *command = DPSOFTRAST_ALLOCATECOMMAND(DepthRange);
1140         command->nearval = nearval;
1141         command->farval = farval;
1142 }
1143
1144 DEFCOMMAND(13, PolygonOffset, float alongnormal; float intoview;)
1145 static void DPSOFTRAST_Interpret_PolygonOffset(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_PolygonOffset *command)
1146 {
1147         thread->polygonoffset[0] = command->alongnormal;
1148         thread->polygonoffset[1] = command->intoview;
1149 }
1150 void DPSOFTRAST_PolygonOffset(float alongnormal, float intoview)
1151 {
1152         DPSOFTRAST_Command_PolygonOffset *command = DPSOFTRAST_ALLOCATECOMMAND(PolygonOffset);
1153         command->alongnormal = alongnormal;
1154         command->intoview = intoview;
1155 }
1156
1157 DEFCOMMAND(14, CullFace, int mode;)
1158 static void DPSOFTRAST_Interpret_CullFace(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_CullFace *command)
1159 {
1160         thread->cullface = command->mode;
1161 }
1162 void DPSOFTRAST_CullFace(int mode)
1163 {
1164         DPSOFTRAST_Command_CullFace *command = DPSOFTRAST_ALLOCATECOMMAND(CullFace);
1165         command->mode = mode;
1166 }
1167
1168 void DPSOFTRAST_Color4f(float r, float g, float b, float a)
1169 {
1170         dpsoftrast.color[0] = r;
1171         dpsoftrast.color[1] = g;
1172         dpsoftrast.color[2] = b;
1173         dpsoftrast.color[3] = a;
1174 }
1175
1176 void DPSOFTRAST_GetPixelsBGRA(int blockx, int blocky, int blockwidth, int blockheight, unsigned char *outpixels)
1177 {
1178         int outstride = blockwidth * 4;
1179         int instride = dpsoftrast.fb_width * 4;
1180         int bx1 = blockx;
1181         int by1 = blocky;
1182         int bx2 = blockx + blockwidth;
1183         int by2 = blocky + blockheight;
1184         int bw;
1185         int x;
1186         int y;
1187         unsigned char *inpixels;
1188         unsigned char *b;
1189         unsigned char *o;
1190         DPSOFTRAST_Flush();
1191         if (bx1 < 0) bx1 = 0;
1192         if (by1 < 0) by1 = 0;
1193         if (bx2 > dpsoftrast.fb_width) bx2 = dpsoftrast.fb_width;
1194         if (by2 > dpsoftrast.fb_height) by2 = dpsoftrast.fb_height;
1195         bw = bx2 - bx1;
1196         inpixels = (unsigned char *)dpsoftrast.fb_colorpixels[0];
1197         if (dpsoftrast.bigendian)
1198         {
1199                 for (y = by1;y < by2;y++)
1200                 {
1201                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1202                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1203                         for (x = bx1;x < bx2;x++)
1204                         {
1205                                 o[0] = b[3];
1206                                 o[1] = b[2];
1207                                 o[2] = b[1];
1208                                 o[3] = b[0];
1209                                 o += 4;
1210                                 b += 4;
1211                         }
1212                 }
1213         }
1214         else
1215         {
1216                 for (y = by1;y < by2;y++)
1217                 {
1218                         b = (unsigned char *)inpixels + (dpsoftrast.fb_height - 1 - y) * instride + 4 * bx1;
1219                         o = (unsigned char *)outpixels + (y - by1) * outstride;
1220                         memcpy(o, b, bw*4);
1221                 }
1222         }
1223
1224 }
1225 void DPSOFTRAST_CopyRectangleToTexture(int index, int mip, int tx, int ty, int sx, int sy, int width, int height)
1226 {
1227         int tx1 = tx;
1228         int ty1 = ty;
1229         int tx2 = tx + width;
1230         int ty2 = ty + height;
1231         int sx1 = sx;
1232         int sy1 = sy;
1233         int sx2 = sx + width;
1234         int sy2 = sy + height;
1235         int swidth;
1236         int sheight;
1237         int twidth;
1238         int theight;
1239         int sw;
1240         int sh;
1241         int tw;
1242         int th;
1243         int y;
1244         unsigned int *spixels;
1245         unsigned int *tpixels;
1246         DPSOFTRAST_Texture *texture;
1247         texture = DPSOFTRAST_Texture_GetByIndex(index);if (!texture) return;
1248         if (mip < 0 || mip >= texture->mipmaps) return;
1249         DPSOFTRAST_Flush();
1250         spixels = dpsoftrast.fb_colorpixels[0];
1251         swidth = dpsoftrast.fb_width;
1252         sheight = dpsoftrast.fb_height;
1253         tpixels = (unsigned int *)(texture->bytes + texture->mipmap[mip][0]);
1254         twidth = texture->mipmap[mip][2];
1255         theight = texture->mipmap[mip][3];
1256         if (tx1 < 0) tx1 = 0;
1257         if (ty1 < 0) ty1 = 0;
1258         if (tx2 > twidth) tx2 = twidth;
1259         if (ty2 > theight) ty2 = theight;
1260         if (sx1 < 0) sx1 = 0;
1261         if (sy1 < 0) sy1 = 0;
1262         if (sx2 > swidth) sx2 = swidth;
1263         if (sy2 > sheight) sy2 = sheight;
1264         tw = tx2 - tx1;
1265         th = ty2 - ty1;
1266         sw = sx2 - sx1;
1267         sh = sy2 - sy1;
1268         if (tw > sw) tw = sw;
1269         if (th > sh) th = sh;
1270         if (tw < 1 || th < 1)
1271                 return;
1272         sy1 = sheight - 1 - sy1;
1273         for (y = 0;y < th;y++)
1274                 memcpy(tpixels + ((ty1 + y) * twidth + tx1), spixels + ((sy1 - y) * swidth + sx1), tw*4);
1275         if (texture->mipmaps > 1)
1276                 DPSOFTRAST_Texture_CalculateMipmaps(index);
1277 }
1278
1279 DEFCOMMAND(17, SetTexture, int unitnum; DPSOFTRAST_Texture *texture;)
1280 static void DPSOFTRAST_Interpret_SetTexture(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetTexture *command)
1281 {
1282         if (thread->texbound[command->unitnum])
1283                 ATOMIC_DECREMENT(thread->texbound[command->unitnum]->binds);
1284         thread->texbound[command->unitnum] = command->texture;
1285 }
1286 void DPSOFTRAST_SetTexture(int unitnum, int index)
1287 {
1288         DPSOFTRAST_Command_SetTexture *command;
1289         DPSOFTRAST_Texture *texture;
1290         if (unitnum < 0 || unitnum >= DPSOFTRAST_MAXTEXTUREUNITS)
1291         {
1292                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid unit number";
1293                 return;
1294         }
1295         texture = DPSOFTRAST_Texture_GetByIndex(index);
1296         if (index && !texture)
1297         {
1298                 dpsoftrast.errorstring = "DPSOFTRAST_SetTexture: invalid texture handle";
1299                 return;
1300         }
1301
1302         command = DPSOFTRAST_ALLOCATECOMMAND(SetTexture);
1303         command->unitnum = unitnum;
1304         command->texture = texture;
1305
1306         dpsoftrast.texbound[unitnum] = texture;
1307         ATOMIC_ADD(texture->binds, dpsoftrast.numthreads);
1308 }
1309
1310 void DPSOFTRAST_SetVertexPointer(const float *vertex3f, size_t stride)
1311 {
1312         dpsoftrast.pointer_vertex3f = vertex3f;
1313         dpsoftrast.stride_vertex = stride;
1314 }
1315 void DPSOFTRAST_SetColorPointer(const float *color4f, size_t stride)
1316 {
1317         dpsoftrast.pointer_color4f = color4f;
1318         dpsoftrast.pointer_color4ub = NULL;
1319         dpsoftrast.stride_color = stride;
1320 }
1321 void DPSOFTRAST_SetColorPointer4ub(const unsigned char *color4ub, size_t stride)
1322 {
1323         dpsoftrast.pointer_color4f = NULL;
1324         dpsoftrast.pointer_color4ub = color4ub;
1325         dpsoftrast.stride_color = stride;
1326 }
1327 void DPSOFTRAST_SetTexCoordPointer(int unitnum, int numcomponents, size_t stride, const float *texcoordf)
1328 {
1329         dpsoftrast.pointer_texcoordf[unitnum] = texcoordf;
1330         dpsoftrast.components_texcoord[unitnum] = numcomponents;
1331         dpsoftrast.stride_texcoord[unitnum] = stride;
1332 }
1333
1334 DEFCOMMAND(18, SetShader, int mode; int permutation; int exactspecularmath;)
1335 static void DPSOFTRAST_Interpret_SetShader(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_SetShader *command)
1336 {
1337         thread->shader_mode = command->mode;
1338         thread->shader_permutation = command->permutation;
1339         thread->shader_exactspecularmath = command->exactspecularmath;
1340 }
1341 void DPSOFTRAST_SetShader(int mode, int permutation, int exactspecularmath)
1342 {
1343         DPSOFTRAST_Command_SetShader *command = DPSOFTRAST_ALLOCATECOMMAND(SetShader);
1344         command->mode = mode;
1345         command->permutation = permutation;
1346         command->exactspecularmath = exactspecularmath;
1347
1348         dpsoftrast.shader_mode = mode;
1349         dpsoftrast.shader_permutation = permutation;
1350         dpsoftrast.shader_exactspecularmath = exactspecularmath;
1351 }
1352
1353 DEFCOMMAND(19, Uniform4f, DPSOFTRAST_UNIFORM index; float val[4];)
1354 static void DPSOFTRAST_Interpret_Uniform4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform4f *command)
1355 {
1356         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1357 }
1358 void DPSOFTRAST_Uniform4f(DPSOFTRAST_UNIFORM index, float v0, float v1, float v2, float v3)
1359 {
1360         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1361         command->index = index;
1362         command->val[0] = v0;
1363         command->val[1] = v1;
1364         command->val[2] = v2;
1365         command->val[3] = v3;
1366
1367         dpsoftrast.uniform4f[index*4+0] = v0;
1368         dpsoftrast.uniform4f[index*4+1] = v1;
1369         dpsoftrast.uniform4f[index*4+2] = v2;
1370         dpsoftrast.uniform4f[index*4+3] = v3;
1371 }
1372 void DPSOFTRAST_Uniform4fv(DPSOFTRAST_UNIFORM index, const float *v)
1373 {
1374         DPSOFTRAST_Command_Uniform4f *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform4f);
1375         command->index = index;
1376         memcpy(command->val, v, sizeof(command->val));
1377
1378         memcpy(&dpsoftrast.uniform4f[index*4], v, sizeof(float[4]));
1379 }
1380
1381 DEFCOMMAND(20, UniformMatrix4f, DPSOFTRAST_UNIFORM index; ALIGN(float val[16]);)
1382 static void DPSOFTRAST_Interpret_UniformMatrix4f(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_UniformMatrix4f *command)
1383 {
1384         memcpy(&thread->uniform4f[command->index*4], command->val, sizeof(command->val));
1385 }
1386 void DPSOFTRAST_UniformMatrix4fv(DPSOFTRAST_UNIFORM uniform, int arraysize, int transpose, const float *v)
1387 {
1388 #ifdef SSE_POSSIBLE
1389         int i, index;
1390         for (i = 0, index = (int)uniform;i < arraysize;i++, index += 4, v += 16)
1391         {
1392                 __m128 m0, m1, m2, m3;
1393                 DPSOFTRAST_Command_UniformMatrix4f *command = DPSOFTRAST_ALLOCATECOMMAND(UniformMatrix4f);
1394                 command->index = (DPSOFTRAST_UNIFORM)index;
1395                 if (((size_t)v)&(ALIGN_SIZE-1))
1396                 {
1397                         m0 = _mm_loadu_ps(v);
1398                         m1 = _mm_loadu_ps(v+4);
1399                         m2 = _mm_loadu_ps(v+8);
1400                         m3 = _mm_loadu_ps(v+12);
1401                 }
1402                 else
1403                 {
1404                         m0 = _mm_load_ps(v);
1405                         m1 = _mm_load_ps(v+4);
1406                         m2 = _mm_load_ps(v+8);
1407                         m3 = _mm_load_ps(v+12);
1408                 }
1409                 if (transpose)
1410                 {
1411                         __m128 t0, t1, t2, t3;
1412                         t0 = _mm_unpacklo_ps(m0, m1);
1413                         t1 = _mm_unpacklo_ps(m2, m3);
1414                         t2 = _mm_unpackhi_ps(m0, m1);
1415                         t3 = _mm_unpackhi_ps(m2, m3);
1416                         m0 = _mm_movelh_ps(t0, t1);
1417                         m1 = _mm_movehl_ps(t1, t0);
1418                         m2 = _mm_movelh_ps(t2, t3);
1419                         m3 = _mm_movehl_ps(t3, t2);                     
1420                 }
1421                 _mm_store_ps(command->val, m0);
1422                 _mm_store_ps(command->val+4, m1);
1423                 _mm_store_ps(command->val+8, m2);
1424                 _mm_store_ps(command->val+12, m3);
1425                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+0], m0);
1426                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+4], m1);
1427                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+8], m2);
1428                 _mm_store_ps(&dpsoftrast.uniform4f[index*4+12], m3);
1429         }
1430 #endif
1431 }
1432
1433 DEFCOMMAND(21, Uniform1i, DPSOFTRAST_UNIFORM index; int val;)
1434 static void DPSOFTRAST_Interpret_Uniform1i(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Uniform1i *command)
1435 {
1436         thread->uniform1i[command->index] = command->val;
1437 }
1438 void DPSOFTRAST_Uniform1i(DPSOFTRAST_UNIFORM index, int i0)
1439 {
1440         DPSOFTRAST_Command_Uniform1i *command = DPSOFTRAST_ALLOCATECOMMAND(Uniform1i);
1441         command->index = index;
1442         command->val = i0;
1443
1444         dpsoftrast.uniform1i[command->index] = i0;
1445 }
1446
1447 DEFCOMMAND(24, ClipPlane, float clipplane[4];)
1448 static void DPSOFTRAST_Interpret_ClipPlane(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_ClipPlane *command)
1449 {
1450         memcpy(thread->clipplane, command->clipplane, 4*sizeof(float));
1451         thread->validate |= DPSOFTRAST_VALIDATE_FB;
1452 }
1453 void DPSOFTRAST_ClipPlane(float x, float y, float z, float w)
1454 {
1455         DPSOFTRAST_Command_ClipPlane *command = DPSOFTRAST_ALLOCATECOMMAND(ClipPlane);
1456         command->clipplane[0] = x;
1457         command->clipplane[1] = y;
1458         command->clipplane[2] = z;
1459         command->clipplane[3] = w;
1460 }
1461
1462 #ifdef SSE_POSSIBLE
1463 static void DPSOFTRAST_Load4fTo4f(float *dst, const unsigned char *src, int size, int stride)
1464 {
1465         float *end = dst + size*4;
1466         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1)) // check for alignment
1467         {
1468                 while (dst < end)
1469                 {
1470                         _mm_store_ps(dst, _mm_loadu_ps((const float *)src));
1471                         dst += 4;
1472                         src += stride;
1473                 }
1474         }
1475         else
1476         {
1477                 while (dst < end)
1478                 {
1479                         _mm_store_ps(dst, _mm_load_ps((const float *)src));
1480                         dst += 4;
1481                         src += stride;
1482                 }
1483         }
1484 }
1485
1486 static void DPSOFTRAST_Load3fTo4f(float *dst, const unsigned char *src, int size, int stride)
1487 {
1488         float *end = dst + size*4;
1489         if (stride == sizeof(float[3]))
1490         {
1491                 float *end4 = dst + (size&~3)*4;        
1492                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1493                 {
1494                         while (dst < end4)
1495                         {
1496                                 __m128 v1 = _mm_loadu_ps((const float *)src), v2 = _mm_loadu_ps((const float *)src + 4), v3 = _mm_loadu_ps((const float *)src + 8), dv; 
1497                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1498                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1499                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1500                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1501                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1502                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1503                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1504                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1505                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1506                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1507                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1508                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1509                                 dst += 16;
1510                                 src += 4*sizeof(float[3]);
1511                         }
1512                 }
1513                 else
1514                 {
1515                         while (dst < end4)
1516                         {
1517                                 __m128 v1 = _mm_load_ps((const float *)src), v2 = _mm_load_ps((const float *)src + 4), v3 = _mm_load_ps((const float *)src + 8), dv;
1518                                 dv = _mm_shuffle_ps(v1, v1, _MM_SHUFFLE(2, 1, 0, 3));
1519                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1520                                 _mm_store_ps(dst, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1521                                 dv = _mm_shuffle_ps(v1, v2, _MM_SHUFFLE(1, 0, 3, 3));
1522                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1523                                 _mm_store_ps(dst + 4, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1524                                 dv = _mm_shuffle_ps(v2, v3, _MM_SHUFFLE(0, 0, 3, 2));
1525                                 dv = _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(2, 1, 0, 3));
1526                                 dv = _mm_move_ss(dv, _mm_set_ss(1.0f));
1527                                 _mm_store_ps(dst + 8, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1528                                 dv = _mm_move_ss(v3, _mm_set_ss(1.0f));
1529                                 _mm_store_ps(dst + 12, _mm_shuffle_ps(dv, dv, _MM_SHUFFLE(0, 3, 2, 1)));
1530                                 dst += 16;
1531                                 src += 4*sizeof(float[3]);
1532                         }
1533                 }
1534         }
1535         if ((((size_t)src)|stride)&(ALIGN_SIZE - 1))
1536         {
1537                 while (dst < end)
1538                 {
1539                         __m128 v = _mm_loadu_ps((const float *)src);
1540                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1541                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1542                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1543                         _mm_store_ps(dst, v);
1544                         dst += 4;
1545                         src += stride;
1546                 }
1547         }
1548         else
1549         {
1550                 while (dst < end)
1551                 {
1552                         __m128 v = _mm_load_ps((const float *)src);
1553                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 1, 0, 3));
1554                         v = _mm_move_ss(v, _mm_set_ss(1.0f));
1555                         v = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1));
1556                         _mm_store_ps(dst, v);
1557                         dst += 4;
1558                         src += stride;
1559                 }
1560         }
1561 }
1562
1563 static void DPSOFTRAST_Load2fTo4f(float *dst, const unsigned char *src, int size, int stride)
1564 {
1565         float *end = dst + size*4;
1566         __m128 v2 = _mm_setr_ps(0.0f, 0.0f, 0.0f, 1.0f);
1567         if (stride == sizeof(float[2]))
1568         {
1569                 float *end2 = dst + (size&~1)*4;
1570                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1571                 {
1572                         while (dst < end2)
1573                         {
1574                                 __m128 v = _mm_loadu_ps((const float *)src);
1575                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1576                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1577                                 dst += 8;
1578                                 src += 2*sizeof(float[2]);
1579                         }
1580                 }
1581                 else
1582                 {
1583                         while (dst < end2)
1584                         {
1585                                 __m128 v = _mm_load_ps((const float *)src);
1586                                 _mm_store_ps(dst, _mm_shuffle_ps(v, v2, _MM_SHUFFLE(3, 2, 1, 0)));
1587                                 _mm_store_ps(dst + 4, _mm_movehl_ps(v2, v));
1588                                 dst += 8;
1589                                 src += 2*sizeof(float[2]);
1590                         }
1591                 }
1592         }
1593         while (dst < end)
1594         {
1595                 _mm_store_ps(dst, _mm_loadl_pi(v2, (__m64 *)src));
1596                 dst += 4;
1597                 src += stride;
1598         }
1599 }
1600
1601 static void DPSOFTRAST_Load4bTo4f(float *dst, const unsigned char *src, int size, int stride)
1602 {
1603         float *end = dst + size*4;
1604         __m128 scale = _mm_set1_ps(1.0f/255.0f);
1605         if (stride == sizeof(unsigned char[4]))
1606         {
1607                 float *end4 = dst + (size&~3)*4;
1608                 if (((size_t)src)&(ALIGN_SIZE - 1)) // check for alignment
1609                 {
1610                         while (dst < end4)
1611                         {
1612                                 __m128i v = _mm_loadu_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1613                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1614                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1615                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1616                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1617                                 dst += 16;
1618                                 src += 4*sizeof(unsigned char[4]);
1619                         }
1620                 }
1621                 else
1622                 {
1623                         while (dst < end4)
1624                         {
1625                                 __m128i v = _mm_load_si128((const __m128i *)src), v1 = _mm_unpacklo_epi8(v, _mm_setzero_si128()), v2 = _mm_unpackhi_epi8(v, _mm_setzero_si128());
1626                                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v1, _mm_setzero_si128())), scale));
1627                                 _mm_store_ps(dst + 4, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v1, _mm_setzero_si128())), scale));
1628                                 _mm_store_ps(dst + 8, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(v2, _mm_setzero_si128())), scale));
1629                                 _mm_store_ps(dst + 12, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpackhi_epi16(v2, _mm_setzero_si128())), scale));
1630                                 dst += 16;
1631                                 src += 4*sizeof(unsigned char[4]);
1632                         }
1633                 }
1634         }
1635         while (dst < end)
1636         {
1637                 __m128i v = _mm_cvtsi32_si128(*(const int *)src);
1638                 _mm_store_ps(dst, _mm_mul_ps(_mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_unpacklo_epi8(v, _mm_setzero_si128()), _mm_setzero_si128())), scale));
1639                 dst += 4;
1640                 src += stride;
1641         }
1642 }
1643
1644 static void DPSOFTRAST_Fill4f(float *dst, const float *src, int size)
1645 {
1646         float *end = dst + 4*size;
1647         __m128 v = _mm_loadu_ps(src);
1648         while (dst < end)
1649         {
1650                 _mm_store_ps(dst, v);
1651                 dst += 4;
1652         }
1653 }
1654 #endif
1655
// Multiplies `numitems` float[4] vectors from in4f by the 4x4 matrix
// inmatrix16f and writes the results to out4f:
//   out = x*M[0..3] + y*M[4..7] + z*M[8..11] + w*M[12..15]
// An exact (bytewise) identity matrix short-circuits to a plain copy, or to
// nothing at all when out4f and in4f are the same pointer.
// out4f is written with aligned stores; in4f may be unaligned.
// Without SSE_POSSIBLE this function does nothing.
static void DPSOFTRAST_Vertex_Transform(float *out4f, const float *in4f, int numitems, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	static const float identitymatrix[4][4] = {{1,0,0,0},{0,1,0,0},{0,0,1,0},{0,0,0,1}};
	__m128 row0, row1, row2, row3;
	float *outend;

	if (memcmp(identitymatrix, inmatrix16f, sizeof(float[16])) == 0)
	{
		// fast case for identity matrix
		if (out4f != in4f)
			memcpy(out4f, in4f, numitems * sizeof(float[4]));
		return;
	}

	outend = out4f + numitems*4;
	row0 = _mm_loadu_ps(inmatrix16f);
	row1 = _mm_loadu_ps(inmatrix16f + 4);
	row2 = _mm_loadu_ps(inmatrix16f + 8);
	row3 = _mm_loadu_ps(inmatrix16f + 12);

	if (((size_t)in4f)&(ALIGN_SIZE-1))
	{
		// input is not 16-byte aligned
		for (; out4f < outend; out4f += 4, in4f += 4)
		{
			__m128 vin = _mm_loadu_ps(in4f);
			__m128 sum = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(2, 2, 2, 2)), row2), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(1, 1, 1, 1)), row1), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(0, 0, 0, 0)), row0), sum);
			_mm_store_ps(out4f, sum);
		}
	}
	else
	{
		for (; out4f < outend; out4f += 4, in4f += 4)
		{
			__m128 vin = _mm_load_ps(in4f);
			__m128 sum = _mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(3, 3, 3, 3)), row3);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(2, 2, 2, 2)), row2), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(1, 1, 1, 1)), row1), sum);
			sum = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(vin, vin, _MM_SHUFFLE(0, 0, 0, 0)), row0), sum);
			_mm_store_ps(out4f, sum);
		}
	}
#endif
}
1703
1704 #if 0
// Copies `numitems` float[4] vectors from in4f to out4f with memcpy.
static void DPSOFTRAST_Vertex_Copy(float *out4f, const float *in4f, int numitems)
{
	const size_t numbytes = numitems * sizeof(float[4]);
	memcpy(out4f, in4f, numbytes);
}
1709 #endif
1710
1711 #ifdef SSE_POSSIBLE
// Perspective-divides one vertex and maps it through the viewport.
// With in = (x, y, z, w), c = viewportcenter and s = viewportscale (all __m128):
//   out = (c[1] + s[1]*x/w,  c[2] + s[2]*y/w,  c[3] + s[3]*z/w,  c[0] + s[0]/w)
// i.e. the lanes are rotated to (1, x, y, z) before the scale/divide/add and
// rotated back afterwards, so lane 0 of center/scale applies to the 1/w term.
// `in` is read into a local before `out` is assigned, so out and in may be
// the same variable.
// Caution: expands to a brace block declaring locals named `p` and `w`;
// arguments must not be expressions that refer to variables with those names.
#define DPSOFTRAST_PROJECTVERTEX(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
	p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1719
// Perspective-divides one vertex and maps it through the viewport.
// With in = (x, y, z, w), c = viewportcenter and s = viewportscale (all __m128):
//   out = (c[1] + s[1]*x/w,  c[2] + s[2]*y/w,  c[3] + s[3]*z/w,  c[0] + s[0]/w)
// NOTE(review): despite the name, the body is identical to
// DPSOFTRAST_PROJECTVERTEX (all four lanes are computed) - confirm whether a
// Y-only variant was intended or whether this macro can be dropped.
// Caution: expands to a brace block declaring locals named `p` and `w`;
// arguments must not be expressions that refer to variables with those names.
#define DPSOFTRAST_PROJECTY(out, in, viewportcenter, viewportscale) \
{ \
	__m128 p = (in), w = _mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)); \
	p = _mm_move_ss(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 1, 0, 3)), _mm_set_ss(1.0f)); \
	p = _mm_add_ps(viewportcenter, _mm_div_ps(_mm_mul_ps(viewportscale, p), w)); \
	out = _mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 3, 2, 1)); \
}
1727
// Multiplies one vertex by a matrix given as four __m128 values.
// With in = (x, y, z, w):
//   out = x*m0 + (y*m1 + (z*m2 + w*m3))
// `in` is read into a local before `out` is assigned, so out and in may be
// the same variable.
// Caution: expands to a brace block declaring a local named `p`; arguments
// must not be expressions that refer to a variable with that name.
#define DPSOFTRAST_TRANSFORMVERTEX(out, in, m0, m1, m2, m3) \
{ \
	__m128 p = (in); \
	out = _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(0, 0, 0, 0)), m0), \
						  _mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(1, 1, 1, 1)), m1), \
								_mm_add_ps(_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(2, 2, 2, 2)), m2), \
											_mm_mul_ps(_mm_shuffle_ps(p, p, _MM_SHUFFLE(3, 3, 3, 3)), m3)))); \
}
1736
// Computes a screen-space Y range for the box spanned by minposf/maxposf
// after transformation by inmatrix16f.
//
// The 8 box corners are transformed; corner k takes x from maxpos when bit 0
// of k is set, y from maxpos when bit 1 is set, and z and w from maxpos when
// bit 2 is set (otherwise from minpos). A corner counts as "in front" when
// its transformed z + w >= 0. For corners in front, y/w is accumulated into
// a running min/max. For each corner that is not in front, the three box
// edges to its neighbours (k^1, k^2, k^4) are intersected with z + w = 0
// whenever the neighbour is in front, and y/w of the intersection point is
// accumulated as well.
//
// The min/max are clamped to [-2, 2] and mapped through lane 2 of
// dpsoftrast.fb_viewportcenter / fb_viewportscale (the lane that
// DPSOFTRAST_PROJECTVERTEX applies to y). *starty receives the truncated
// mapped maximum, *endy the truncated mapped minimum plus one.
//
// minposf and maxposf are read with aligned loads (_mm_load_ps); inmatrix16f
// is read with unaligned loads.
//
// Returns a mask in which bit k is set when corner k is NOT in front
// (0xFF = no corner passed the test, 0 = all corners passed).
static int DPSOFTRAST_Vertex_BoundY(int *starty, int *endy, const float *minposf, const float *maxposf, const float *inmatrix16f)
{
	int clipmask = 0xFF;
	__m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
	// minproj/maxproj start out inverted (2 / -2) so that the first accumulated value replaces them
	__m128 bb[8], clipdist[8], minproj = _mm_set_ss(2.0f), maxproj = _mm_set_ss(-2.0f);
	__m128 m0 = _mm_loadu_ps(inmatrix16f), m1 = _mm_loadu_ps(inmatrix16f + 4), m2 = _mm_loadu_ps(inmatrix16f + 8), m3 = _mm_loadu_ps(inmatrix16f + 12);
	__m128 minpos = _mm_load_ps(minposf), maxpos = _mm_load_ps(maxposf);
	// swap lanes 0 and 1 of every matrix row so that the transformed Y ends up
	// in lane 0, where the scalar (_ss) operations below can reach it
	m0 = _mm_shuffle_ps(m0, m0, _MM_SHUFFLE(3, 2, 0, 1));
	m1 = _mm_shuffle_ps(m1, m1, _MM_SHUFFLE(3, 2, 0, 1));
	m2 = _mm_shuffle_ps(m2, m2, _MM_SHUFFLE(3, 2, 0, 1));
	m3 = _mm_shuffle_ps(m3, m3, _MM_SHUFFLE(3, 2, 0, 1));
	// BBFRONT(k, pos): transform corner k into bb[k] (lanes: y, x, z, w),
	// store z + w in lane 0 of clipdist[k]; if that is >= 0, clear bit k of
	// clipmask and accumulate y/w into minproj/maxproj.
	#define BBFRONT(k, pos) \
	{ \
		DPSOFTRAST_TRANSFORMVERTEX(bb[k], pos, m0, m1, m2, m3); \
		clipdist[k] = _mm_add_ss(_mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(2, 2, 2, 2)), _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
		if (_mm_ucomige_ss(clipdist[k], _mm_setzero_ps())) \
		{ \
			__m128 proj; \
			clipmask &= ~(1<<k); \
			proj = _mm_div_ss(bb[k], _mm_shuffle_ps(bb[k], bb[k], _MM_SHUFFLE(3, 3, 3, 3))); \
			minproj = _mm_min_ss(minproj, proj); \
			maxproj = _mm_max_ss(maxproj, proj); \
		} \
	}
	BBFRONT(0, minpos);
	BBFRONT(1, _mm_move_ss(minpos, maxpos));
	BBFRONT(2, _mm_shuffle_ps(_mm_move_ss(maxpos, minpos), minpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(3, _mm_shuffle_ps(maxpos, minpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(4, _mm_shuffle_ps(minpos, maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(5, _mm_shuffle_ps(_mm_move_ss(minpos, maxpos), maxpos, _MM_SHUFFLE(3, 2, 1, 0)));
	BBFRONT(6, _mm_move_ss(maxpos, minpos));
	BBFRONT(7, maxpos);
	// BBCLIP(k): only acts when corner k is not in front. For each of the
	// three neighbours j = k^1, k^2, k^4 that IS in front, interpolate along
	// the edge with frac = clipdist[k] / (clipdist[k] - clipdist[j]) (the
	// point where z + w reaches 0) and accumulate y/w of that point.
	#define BBCLIP(k) \
	{ \
		if (clipmask&(1<<k)) \
		{ \
			if (!(clipmask&(1<<(k^1)))) \
			{ \
				__m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^1])); \
				__m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^1], bb[k]))); \
				proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
				minproj = _mm_min_ss(minproj, proj); \
				maxproj = _mm_max_ss(maxproj, proj); \
			} \
			if (!(clipmask&(1<<(k^2)))) \
			{ \
				__m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^2])); \
				__m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^2], bb[k]))); \
				proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
				minproj = _mm_min_ss(minproj, proj); \
				maxproj = _mm_max_ss(maxproj, proj); \
			} \
			if (!(clipmask&(1<<(k^4)))) \
			{ \
				__m128 frac = _mm_div_ss(clipdist[k], _mm_sub_ss(clipdist[k], clipdist[k^4])); \
				__m128 proj = _mm_add_ps(bb[k], _mm_mul_ps(_mm_shuffle_ps(frac, frac, _MM_SHUFFLE(0, 0, 0, 0)), _mm_sub_ps(bb[k^4], bb[k]))); \
				proj = _mm_div_ss(proj, _mm_shuffle_ps(proj, proj, _MM_SHUFFLE(3, 3, 3, 3))); \
				minproj = _mm_min_ss(minproj, proj); \
				maxproj = _mm_max_ss(maxproj, proj); \
			} \
		} \
	}
	BBCLIP(0); BBCLIP(1); BBCLIP(2); BBCLIP(3); BBCLIP(4); BBCLIP(5); BBCLIP(6); BBCLIP(7);
	// bring lane 2 of the viewport center/scale into lane 0 for the scalar math below
	viewportcenter = _mm_shuffle_ps(viewportcenter, viewportcenter, _MM_SHUFFLE(0, 3, 1, 2));
	viewportscale = _mm_shuffle_ps(viewportscale, viewportscale, _MM_SHUFFLE(0, 3, 1, 2));
	// clamp the accumulated y/w range to [-2, 2] before mapping it through the viewport
	minproj = _mm_max_ss(minproj, _mm_set_ss(-2.0f));
	maxproj = _mm_min_ss(maxproj, _mm_set_ss(2.0f));
	minproj = _mm_add_ss(viewportcenter, _mm_mul_ss(minproj, viewportscale));
	maxproj = _mm_add_ss(viewportcenter, _mm_mul_ss(maxproj, viewportscale));
	// NOTE(review): start comes from the max and end from the min, which
	// presumably relies on the Y viewport scale being negative - confirm
	// against where fb_viewportscale is set up.
	*starty = _mm_cvttss_si32(maxproj);
	*endy = _mm_cvttss_si32(minproj)+1;
	return clipmask;
}
1810         
1811 static int DPSOFTRAST_Vertex_Project(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems)
1812 {
1813         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1814         float *end = out4f + numitems*4;
1815         __m128 viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter), viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1816         __m128 minpos, maxpos;
1817         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1818         {
1819                 minpos = maxpos = _mm_loadu_ps(in4f);
1820                 while (out4f < end)
1821                 {
1822                         __m128 v = _mm_loadu_ps(in4f);
1823                         minpos = _mm_min_ps(minpos, v);
1824                         maxpos = _mm_max_ps(maxpos, v);
1825                         _mm_store_ps(out4f, v);
1826                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1827                         _mm_store_ps(screen4f, v);
1828                         in4f += 4;
1829                         out4f += 4;
1830                         screen4f += 4;
1831                 }
1832         }
1833         else
1834         {
1835                 minpos = maxpos = _mm_load_ps(in4f);
1836                 while (out4f < end)
1837                 {
1838                         __m128 v = _mm_load_ps(in4f);
1839                         minpos = _mm_min_ps(minpos, v);
1840                         maxpos = _mm_max_ps(maxpos, v);
1841                         _mm_store_ps(out4f, v);
1842                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1843                         _mm_store_ps(screen4f, v);
1844                         in4f += 4;
1845                         out4f += 4;
1846                         screen4f += 4;
1847                 }
1848         }
1849         if (starty && endy) 
1850         {
1851                 ALIGN(float minposf[4]);
1852                 ALIGN(float maxposf[4]);
1853                 _mm_store_ps(minposf, minpos);
1854                 _mm_store_ps(maxposf, maxpos);
1855                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, identitymatrix);
1856         }
1857         return 0;
1858 }
1859
1860 static int DPSOFTRAST_Vertex_TransformProject(float *out4f, float *screen4f, int *starty, int *endy, const float *in4f, int numitems, const float *inmatrix16f)
1861 {
1862         static const float identitymatrix[16] = {1,0,0,0, 0,1,0,0, 0,0,1,0, 0,0,0,1};
1863         __m128 m0, m1, m2, m3, viewportcenter, viewportscale, minpos, maxpos;
1864         float *end;
1865         if (!memcmp(identitymatrix, inmatrix16f, sizeof(float[16])))
1866                 return DPSOFTRAST_Vertex_Project(out4f, screen4f, starty, endy, in4f, numitems);
1867         end = out4f + numitems*4;
1868         viewportcenter = _mm_load_ps(dpsoftrast.fb_viewportcenter);
1869         viewportscale = _mm_load_ps(dpsoftrast.fb_viewportscale);
1870         m0 = _mm_loadu_ps(inmatrix16f);
1871         m1 = _mm_loadu_ps(inmatrix16f + 4);
1872         m2 = _mm_loadu_ps(inmatrix16f + 8);
1873         m3 = _mm_loadu_ps(inmatrix16f + 12);
1874         if (((size_t)in4f)&(ALIGN_SIZE-1)) // check alignment
1875         {
1876                 minpos = maxpos = _mm_loadu_ps(in4f);
1877                 while (out4f < end)
1878                 {
1879                         __m128 v = _mm_loadu_ps(in4f);
1880                         minpos = _mm_min_ps(minpos, v);
1881                         maxpos = _mm_max_ps(maxpos, v);
1882                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1883                         _mm_store_ps(out4f, v);
1884                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1885                         _mm_store_ps(screen4f, v);
1886                         in4f += 4;
1887                         out4f += 4;
1888                         screen4f += 4;
1889                 }
1890         }
1891         else
1892         {
1893                 minpos = maxpos = _mm_load_ps(in4f);
1894                 while (out4f < end)
1895                 {
1896                         __m128 v = _mm_load_ps(in4f);
1897                         minpos = _mm_min_ps(minpos, v);
1898                         maxpos = _mm_max_ps(maxpos, v);
1899                         DPSOFTRAST_TRANSFORMVERTEX(v, v, m0, m1, m2, m3);
1900                         _mm_store_ps(out4f, v);
1901                         DPSOFTRAST_PROJECTVERTEX(v, v, viewportcenter, viewportscale);
1902                         _mm_store_ps(screen4f, v);
1903                         in4f += 4;
1904                         out4f += 4;
1905                         screen4f += 4;
1906                 }
1907         }
1908         if (starty && endy) 
1909         {
1910                 ALIGN(float minposf[4]);
1911                 ALIGN(float maxposf[4]);
1912                 _mm_store_ps(minposf, minpos);
1913                 _mm_store_ps(maxposf, maxpos);
1914                 return DPSOFTRAST_Vertex_BoundY(starty, endy, minposf, maxposf, inmatrix16f); 
1915         }
1916         return 0;
1917 }
1918 #endif
1919
// Expands one vertex attribute stream into float[4] form in
// dpsoftrast.post_array4f[outarray], for dpsoftrast.numvertices vertices
// starting at dpsoftrast.firstvertex, and returns that buffer.
//   DPSOFTRAST_ARRAY_POSITION: 3 floats per vertex from pointer_vertex3f.
//   DPSOFTRAST_ARRAY_COLOR:    pointer_color4f if set, else pointer_color4ub
//                              if set, else every vertex gets dpsoftrast.color.
//   anything else:             treated as texcoord unit
//                              (inarray - DPSOFTRAST_ARRAY_TEXCOORD0); loaded
//                              as 2, 3 or 4 floats per vertex.
// If the texcoord pointer is NULL, or its component count is not 2, 3 or 4,
// the buffer is returned without being written.
// Without SSE_POSSIBLE this returns NULL.
static float *DPSOFTRAST_Array_Load(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *outf = dpsoftrast.post_array4f[outarray];
	const unsigned char *inb;
	const int firstvertex = dpsoftrast.firstvertex;
	const int numvertices = dpsoftrast.numvertices;
	int stride;

	if (inarray == DPSOFTRAST_ARRAY_POSITION)
	{
		stride = dpsoftrast.stride_vertex;
		inb = (const unsigned char *)dpsoftrast.pointer_vertex3f + firstvertex * stride;
		DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
	}
	else if (inarray == DPSOFTRAST_ARRAY_COLOR)
	{
		stride = dpsoftrast.stride_color;
		if (dpsoftrast.pointer_color4f)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4f + firstvertex * stride;
			DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
		}
		else if (dpsoftrast.pointer_color4ub)
		{
			inb = (const unsigned char *)dpsoftrast.pointer_color4ub + firstvertex * stride;
			DPSOFTRAST_Load4bTo4f(outf, inb, numvertices, stride);
		}
		else
		{
			// no color array bound: replicate the constant color
			DPSOFTRAST_Fill4f(outf, dpsoftrast.color, numvertices);
		}
	}
	else
	{
		const int unit = inarray-DPSOFTRAST_ARRAY_TEXCOORD0;
		stride = dpsoftrast.stride_texcoord[unit];
		if (dpsoftrast.pointer_texcoordf[unit])
		{
			inb = (const unsigned char *)dpsoftrast.pointer_texcoordf[unit] + firstvertex * stride;
			switch(dpsoftrast.components_texcoord[unit])
			{
			case 2:
				DPSOFTRAST_Load2fTo4f(outf, inb, numvertices, stride);
				break;
			case 3:
				DPSOFTRAST_Load3fTo4f(outf, inb, numvertices, stride);
				break;
			case 4:
				DPSOFTRAST_Load4fTo4f(outf, inb, numvertices, stride);
				break;
			}
		}
	}
	return outf;
#else
	return NULL;
#endif
}
1978
1979 static float *DPSOFTRAST_Array_Transform(int outarray, int inarray, const float *inmatrix16f)
1980 {
1981         float *data = inarray >= 0 ? DPSOFTRAST_Array_Load(outarray, inarray) : dpsoftrast.post_array4f[outarray];
1982         DPSOFTRAST_Vertex_Transform(data, data, dpsoftrast.numvertices, inmatrix16f);
1983         return data;
1984 }
1985
1986 #if 0
// Picks the float[4] buffer to work on - freshly loaded from `inarray` when
// inarray >= 0, otherwise the existing dpsoftrast.post_array4f[outarray] -
// projects it into dpsoftrast.screencoord4f, stores the result of
// DPSOFTRAST_Vertex_Project in dpsoftrast.drawclipped (with drawstarty /
// drawendy receiving the Y range) and returns the buffer.
// Without SSE_POSSIBLE this returns NULL.
static float *DPSOFTRAST_Array_Project(int outarray, int inarray)
{
#ifdef SSE_POSSIBLE
	float *data;
	int clipped;

	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];

	clipped = DPSOFTRAST_Vertex_Project(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices);
	dpsoftrast.drawclipped = clipped;
	return data;
#else
	return NULL;
#endif
}
1997 #endif
1998
// Picks the float[4] buffer to work on - freshly loaded from `inarray` when
// inarray >= 0, otherwise the existing dpsoftrast.post_array4f[outarray] -
// transforms it in place by inmatrix16f, projects it into
// dpsoftrast.screencoord4f, stores the result of
// DPSOFTRAST_Vertex_TransformProject in dpsoftrast.drawclipped (with
// drawstarty / drawendy receiving the Y range) and returns the buffer.
// Without SSE_POSSIBLE this returns NULL.
static float *DPSOFTRAST_Array_TransformProject(int outarray, int inarray, const float *inmatrix16f)
{
#ifdef SSE_POSSIBLE
	float *data;
	int clipped;

	if (inarray >= 0)
		data = DPSOFTRAST_Array_Load(outarray, inarray);
	else
		data = dpsoftrast.post_array4f[outarray];

	clipped = DPSOFTRAST_Vertex_TransformProject(data, dpsoftrast.screencoord4f, &dpsoftrast.drawstarty, &dpsoftrast.drawendy, data, dpsoftrast.numvertices, inmatrix16f);
	dpsoftrast.drawclipped = clipped;
	return data;
#else
	return NULL;
#endif
}
2009
2010 static void DPSOFTRAST_Draw_Span_Begin(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *zf)
2011 {
2012         int x;
2013         int startx = span->startx;
2014         int endx = span->endx;
2015         float wslope = triangle->w[0];
2016         float w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
2017         float endz = 1.0f / (w + wslope * startx);
2018         if (triangle->w[0] == 0)
2019         {
2020                 // LordHavoc: fast flat polygons (HUD/menu)
2021                 for (x = startx;x < endx;x++)
2022                         zf[x] = endz;
2023                 return;
2024         }
2025         for (x = startx;x < endx;)
2026         {
2027                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2028                 float z = endz, dz;
2029                 if (nextsub >= endx) nextsub = endsub = endx-1;
2030                 endz = 1.0f / (w + wslope * nextsub);
2031                 dz = x < nextsub ? (endz - z) / (nextsub - x) : 0.0f;
2032                 for (; x <= endsub; x++, z += dz)
2033                         zf[x] = z;
2034         }
2035 }
2036
2037 static void DPSOFTRAST_Draw_Span_FinishBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, const unsigned char* RESTRICT in4ub)
2038 {
2039 #ifdef SSE_POSSIBLE
2040         int x;
2041         int startx = span->startx;
2042         int endx = span->endx;
2043         int maskx;
2044         int subx;
2045         const unsigned int * RESTRICT ini = (const unsigned int *)in4ub;
2046         unsigned char * RESTRICT pixelmask = span->pixelmask;
2047         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0];
2048         unsigned int * RESTRICT pixeli = (unsigned int *)dpsoftrast.fb_colorpixels[0];
2049         if (!pixel)
2050                 return;
2051         pixel += (span->y * dpsoftrast.fb_width + span->x) * 4;
2052         pixeli += span->y * dpsoftrast.fb_width + span->x;
2053         // handle alphatest now (this affects depth writes too)
2054         if (thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL)
2055                 for (x = startx;x < endx;x++)
2056                         if (in4ub[x*4+3] < 128)
2057                                 pixelmask[x] = false;
2058         // LordHavoc: clear pixelmask for some pixels in alphablend cases, this
2059         // helps sprites, text and hud artwork
2060         switch(thread->fb_blendmode)
2061         {
2062         case DPSOFTRAST_BLENDMODE_ALPHA:
2063         case DPSOFTRAST_BLENDMODE_ADDALPHA:
2064         case DPSOFTRAST_BLENDMODE_SUBALPHA:
2065                 maskx = startx;
2066                 for (x = startx;x < endx;x++)
2067                 {
2068                         if (in4ub[x*4+3] >= 1)
2069                         {
2070                                 startx = x;
2071                                 for (;;)
2072                                 {
2073                                         while (++x < endx && in4ub[x*4+3] >= 1) ;
2074                                         maskx = x;
2075                                         if (x >= endx) break;
2076                                         ++x;
2077                                         while (++x < endx && in4ub[x*4+3] < 1) pixelmask[x] = false;
2078                                         if (x >= endx) break;
2079                                 }
2080                                 break;
2081                         }
2082                 }
2083                 endx = maskx;
2084                 break;
2085         case DPSOFTRAST_BLENDMODE_OPAQUE:
2086         case DPSOFTRAST_BLENDMODE_ADD:
2087         case DPSOFTRAST_BLENDMODE_INVMOD:
2088         case DPSOFTRAST_BLENDMODE_MUL:
2089         case DPSOFTRAST_BLENDMODE_MUL2:
2090         case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2091         case DPSOFTRAST_BLENDMODE_INVADD:
2092                 break;
2093         }
2094         // put some special values at the end of the mask to ensure the loops end
2095         pixelmask[endx] = 1;
2096         pixelmask[endx+1] = 0;
2097         // LordHavoc: use a double loop to identify subspans, this helps the
2098         // optimized copy/blend loops to perform at their best, most triangles
2099         // have only one run of pixels, and do the search using wide reads...
2100         x = startx;
2101         while (x < endx)
2102         {
2103                 // if this pixel is masked off, it's probably not alone...
2104                 if (!pixelmask[x])
2105                 {
2106                         x++;
2107 #if 1
2108                         if (x + 8 < endx)
2109                         {
2110                                 // the 4-item search must be aligned or else it stalls badly
2111                                 if ((x & 3) && !pixelmask[x]) 
2112                                 {
2113                                         if(pixelmask[x]) goto endmasked;
2114                                         x++;
2115                                         if (x & 3)
2116                                         {
2117                                                 if(pixelmask[x]) goto endmasked;
2118                                                 x++;
2119                                                 if (x & 3)
2120                                                 {
2121                                                         if(pixelmask[x]) goto endmasked;
2122                                                         x++;
2123                                                 }
2124                                         }
2125                                 }
2126                                 while (*(unsigned int *)&pixelmask[x] == 0x00000000)
2127                                         x += 4;
2128                         }
2129 #endif
2130                         for (;!pixelmask[x];x++)
2131                                 ;
2132                         // rather than continue the loop, just check the end variable
2133                         if (x >= endx)
2134                                 break;
2135                 }
2136         endmasked:
2137                 // find length of subspan
2138                 subx = x + 1;
2139 #if 1
2140                 if (subx + 8 < endx)
2141                 {
2142                         if (subx & 3)
2143                         {
2144                                 if(!pixelmask[subx]) goto endunmasked;
2145                                 subx++;
2146                                 if (subx & 3)
2147                                 {
2148                                         if(!pixelmask[subx]) goto endunmasked;
2149                                         subx++;
2150                                         if (subx & 3)
2151                                         {
2152                                                 if(!pixelmask[subx]) goto endunmasked;
2153                                                 subx++;
2154                                         }
2155                                 }
2156                         }
2157                         while (*(unsigned int *)&pixelmask[subx] == 0x01010101)
2158                                 subx += 4;
2159                 }
2160 #endif
2161                 for (;pixelmask[subx];subx++)
2162                         ;
2163                 // the checks can overshoot, so make sure to clip it...
2164                 if (subx > endx)
2165                         subx = endx;
2166         endunmasked:
2167                 // now that we know the subspan length...  process!
2168                 switch(thread->fb_blendmode)
2169                 {
2170                 case DPSOFTRAST_BLENDMODE_OPAQUE:
2171 #if 0
2172                         if (subx - x >= 16)
2173                         {
2174                                 memcpy(pixeli + x, ini + x, (subx - x) * sizeof(pixeli[x]));
2175                                 x = subx;
2176                         }
2177                         else
2178 #elif 1
2179                         while (x + 16 <= subx)
2180                         {
2181                                 _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2182                                 _mm_storeu_si128((__m128i *)&pixeli[x+4], _mm_loadu_si128((const __m128i *)&ini[x+4]));
2183                                 _mm_storeu_si128((__m128i *)&pixeli[x+8], _mm_loadu_si128((const __m128i *)&ini[x+8]));
2184                                 _mm_storeu_si128((__m128i *)&pixeli[x+12], _mm_loadu_si128((const __m128i *)&ini[x+12]));
2185                                 x += 16;
2186                         }
2187 #endif
2188                         {
2189                                 while (x + 4 <= subx)
2190                                 {
2191                                         _mm_storeu_si128((__m128i *)&pixeli[x], _mm_loadu_si128((const __m128i *)&ini[x]));
2192                                         x += 4;
2193                                 }
2194                                 if (x + 2 <= subx)
2195                                 {
2196                                         pixeli[x] = ini[x];
2197                                         pixeli[x+1] = ini[x+1];
2198                                         x += 2;
2199                                 }
2200                                 if (x < subx)
2201                                 {
2202                                         pixeli[x] = ini[x];
2203                                         x++;
2204                                 }
2205                         }
2206                         break;
2207                 case DPSOFTRAST_BLENDMODE_ALPHA:
2208                 #define FINISHBLEND(blend2, blend1) \
2209                         for (;x + 1 < subx;x += 2) \
2210                         { \
2211                                 __m128i src, dst; \
2212                                 src = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ini[x]), _mm_setzero_si128()); \
2213                                 dst = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&pixeli[x]), _mm_setzero_si128()); \
2214                                 blend2; \
2215                                 _mm_storel_epi64((__m128i *)&pixeli[x], _mm_packus_epi16(dst, dst)); \
2216                         } \
2217                         if (x < subx) \
2218                         { \
2219                                 __m128i src, dst; \
2220                                 src = _mm_unpacklo_epi8(_mm_cvtsi32_si128(ini[x]), _mm_setzero_si128()); \
2221                                 dst = _mm_unpacklo_epi8(_mm_cvtsi32_si128(pixeli[x]), _mm_setzero_si128()); \
2222                                 blend1; \
2223                                 pixeli[x] = _mm_cvtsi128_si32(_mm_packus_epi16(dst, dst)); \
2224                                 x++; \
2225                         }
2226                         FINISHBLEND({
2227                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2228                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2229                         }, {
2230                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2231                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(src, dst), 4), _mm_slli_epi16(blend, 4)));
2232                         });
2233                         break;
2234                 case DPSOFTRAST_BLENDMODE_ADDALPHA:
2235                         FINISHBLEND({
2236                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2237                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2238                         }, {
2239                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2240                                 dst = _mm_add_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2241                         });
2242                         break;
2243                 case DPSOFTRAST_BLENDMODE_ADD:
2244                         FINISHBLEND({ dst = _mm_add_epi16(src, dst); }, { dst = _mm_add_epi16(src, dst); });
2245                         break;
2246                 case DPSOFTRAST_BLENDMODE_INVMOD:
2247                         FINISHBLEND({
2248                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2249                         }, {
2250                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, src), 8));
2251                         });
2252                         break;
2253                 case DPSOFTRAST_BLENDMODE_MUL:
2254                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 8); });
2255                         break;
2256                 case DPSOFTRAST_BLENDMODE_MUL2:
2257                         FINISHBLEND({ dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); }, { dst = _mm_srli_epi16(_mm_mullo_epi16(src, dst), 7); });
2258                         break;
2259                 case DPSOFTRAST_BLENDMODE_SUBALPHA:
2260                         FINISHBLEND({
2261                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2262                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2263                         }, {
2264                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2265                                 dst = _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(src, blend), 8));
2266                         });
2267                         break;
2268                 case DPSOFTRAST_BLENDMODE_PSEUDOALPHA:
2269                         FINISHBLEND({
2270                                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
2271                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2272                         }, {
2273                                 __m128i blend = _mm_shufflelo_epi16(src, _MM_SHUFFLE(3, 3, 3, 3));
2274                                 dst = _mm_add_epi16(src, _mm_sub_epi16(dst, _mm_srli_epi16(_mm_mullo_epi16(dst, blend), 8)));
2275                         });
2276                         break;
2277                 case DPSOFTRAST_BLENDMODE_INVADD:
2278                         FINISHBLEND({
2279                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2280                         }, {
2281                                 dst = _mm_add_epi16(dst, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(_mm_set1_epi16(255), dst), 4), _mm_slli_epi16(src, 4)));
2282                         });
2283                         break;
2284                 }
2285         }
2286 #endif
2287 }
2288
2289 static void DPSOFTRAST_Texture2DBGRA8(DPSOFTRAST_Texture *texture, int mip, float x, float y, unsigned char c[4])
2290         // warning: this is SLOW, only use if the optimized per-span functions won't do
2291 {
2292         const unsigned char * RESTRICT pixelbase;
2293         const unsigned char * RESTRICT pixel[4];
2294         int width = texture->mipmap[mip][2], height = texture->mipmap[mip][3];
2295         int wrapmask[2] = { width-1, height-1 };
2296         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2297         if(texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR)
2298         {
2299                 unsigned int tc[2] = { x * (width<<12) - 2048, y * (height<<12) - 2048};
2300                 unsigned int frac[2] = { tc[0]&0xFFF, tc[1]&0xFFF };
2301                 unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2302                 unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2303                 int tci[2] = { tc[0]>>12, tc[1]>>12 };
2304                 int tci1[2] = { tci[0] + 1, tci[1] + 1 };
2305                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2306                 {
2307                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2308                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2309                         tci1[0] = tci1[0] >= 0 ? (tci1[0] <= wrapmask[0] ? tci1[0] : wrapmask[0]) : 0;
2310                         tci1[1] = tci1[1] >= 0 ? (tci1[1] <= wrapmask[1] ? tci1[1] : wrapmask[1]) : 0;
2311                 }
2312                 else
2313                 {
2314                         tci[0] &= wrapmask[0];
2315                         tci[1] &= wrapmask[1];
2316                         tci1[0] &= wrapmask[0];
2317                         tci1[1] &= wrapmask[1];
2318                 }
2319                 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2320                 pixel[1] = pixelbase + 4 * (tci[1]*width+tci1[0]);
2321                 pixel[2] = pixelbase + 4 * (tci1[1]*width+tci[0]);
2322                 pixel[3] = pixelbase + 4 * (tci1[1]*width+tci1[0]);
2323                 c[0] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3])>>24;
2324                 c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3])>>24;
2325                 c[2] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3])>>24;
2326                 c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3])>>24;
2327         }
2328         else
2329         {
2330                 int tci[2] = { x * width, y * height };
2331                 if (texture->flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2332                 {
2333                         tci[0] = tci[0] >= 0 ? (tci[0] <= wrapmask[0] ? tci[0] : wrapmask[0]) : 0;
2334                         tci[1] = tci[1] >= 0 ? (tci[1] <= wrapmask[1] ? tci[1] : wrapmask[1]) : 0;
2335                 }
2336                 else
2337                 {
2338                         tci[0] &= wrapmask[0];
2339                         tci[1] &= wrapmask[1];
2340                 }
2341                 pixel[0] = pixelbase + 4 * (tci[1]*width+tci[0]);
2342                 c[0] = pixel[0][0];
2343                 c[1] = pixel[0][1];
2344                 c[2] = pixel[0][2];
2345                 c[3] = pixel[0][3];
2346         }
2347 }
2348
// DISABLED (#if 0): variant of the span texture fetch that writes four floats
// per pixel to out4f instead of packed bytes.
//
// For every pixel x in startx..endx-1 of the span it writes out4f[x*4+0..3]:
//  - 1.0f in all channels when no texture is bound to texunitindex
//  - one constant color when texture->mipmap[mip][1] == 4
//  - otherwise a texel fetched at the interpolated texture coordinate taken
//    from attribute array arrayindex (via DPSOFTRAST_CALCATTRIB4F), using
//    zf[] as the per-pixel scale.  The coordinate is evaluated exactly only
//    at subspan ends (every DPSOFTRAST_DRAW_MAXSUBSPAN pixels and at the last
//    pixel) and stepped linearly in between using 12 fractional bits.
// Texels are either blended from four neighbours (FILTER_LINEAR, with a half
// texel offset) or taken from the nearest texel, and coordinates are either
// clamped (FLAG_CLAMPTOEDGE) or wrapped by masking with size-1.
// Bytes 2,1,0,3 of each texel are written to output channels 0,1,2,3, scaled
// to 0..1 (255 * 2^24 == 0xFF000000 is the bilinear weight total).
//
// NOTE(review): texel coordinates are held in unsigned ints here, so the
// ">= tcimin" comparisons (tcimin is 0) can never fail and negative
// coordinates are not clamped to the low edge - confirm before re-enabling.
2349 #if 0
2350 static void DPSOFTRAST_Draw_Span_Texture2DVarying(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float * RESTRICT out4f, int texunitindex, int arrayindex, const float * RESTRICT zf)
2351 {
2352         int x;
2353         int startx = span->startx;
2354         int endx = span->endx;
2355         int flags;
2356         float c[4];
2357         float data[4];
2358         float slope[4];
2359         float tc[2], endtc[2];
2360         float tcscale[2];
2361         unsigned int tci[2];
2362         unsigned int tci1[2];
2363         unsigned int tcimin[2];
2364         unsigned int tcimax[2];
2365         int tciwrapmask[2];
2366         int tciwidth;
2367         int filter;
2368         int mip;
2369         const unsigned char * RESTRICT pixelbase;
2370         const unsigned char * RESTRICT pixel[4];
2371         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2372         // if no texture is bound, just fill it with white
2373         if (!texture)
2374         {
2375                 for (x = startx;x < endx;x++)
2376                 {
2377                         out4f[x*4+0] = 1.0f;
2378                         out4f[x*4+1] = 1.0f;
2379                         out4f[x*4+2] = 1.0f;
2380                         out4f[x*4+3] = 1.0f;
2381                 }
2382                 return;
2383         }
2384         mip = triangle->mip[texunitindex];
2385         pixelbase = (unsigned char *)texture->bytes + texture->mipmap[mip][0];
2386         // if this mipmap of the texture is 1 pixel, just fill it with that color
2387         if (texture->mipmap[mip][1] == 4)
2388         {
// NOTE(review): this reads the first pixel at texture->bytes rather than at
// pixelbase (bytes + mipmap[mip][0]) - looks like the mip offset is ignored; confirm
2389                 c[0] = texture->bytes[2] * (1.0f/255.0f);
2390                 c[1] = texture->bytes[1] * (1.0f/255.0f);
2391                 c[2] = texture->bytes[0] * (1.0f/255.0f);
2392                 c[3] = texture->bytes[3] * (1.0f/255.0f);
2393                 for (x = startx;x < endx;x++)
2394                 {
2395                         out4f[x*4+0] = c[0];
2396                         out4f[x*4+1] = c[1];
2397                         out4f[x*4+2] = c[2];
2398                         out4f[x*4+3] = c[3];
2399                 }
2400                 return;
2401         }
2402         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2403         DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
2404         flags = texture->flags;
2405         tcscale[0] = texture->mipmap[mip][2];
2406         tcscale[1] = texture->mipmap[mip][3];
2407         tciwidth = texture->mipmap[mip][2];
2408         tcimin[0] = 0;
2409         tcimin[1] = 0;
2410         tcimax[0] = texture->mipmap[mip][2]-1;
2411         tcimax[1] = texture->mipmap[mip][3]-1;
2412         tciwrapmask[0] = texture->mipmap[mip][2]-1;
2413         tciwrapmask[1] = texture->mipmap[mip][3]-1;
// texel-space coordinate at the first pixel of the span
2414         endtc[0] = (data[0] + slope[0]*startx) * zf[startx] * tcscale[0];
2415         endtc[1] = (data[1] + slope[1]*startx) * zf[startx] * tcscale[1];
2416         if (filter)
2417         {
2418                 endtc[0] -= 0.5f;
2419                 endtc[1] -= 0.5f;
2420         }
2421         for (x = startx;x < endx;)
2422         {
2423                 unsigned int subtc[2];
2424                 unsigned int substep[2];
2425                 float subscale = 4096.0f/DPSOFTRAST_DRAW_MAXSUBSPAN;
2426                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2427                 if (nextsub >= endx)
2428                 {
2429                         nextsub = endsub = endx-1;      
2430                         if (x < nextsub) subscale = 4096.0f / (nextsub - x);
2431                 }
2432                 tc[0] = endtc[0];
2433                 tc[1] = endtc[1];
2434                 endtc[0] = (data[0] + slope[0]*nextsub) * zf[nextsub] * tcscale[0];
2435                 endtc[1] = (data[1] + slope[1]*nextsub) * zf[nextsub] * tcscale[1];
2436                 if (filter)
2437                 {
2438                         endtc[0] -= 0.5f;
2439                         endtc[1] -= 0.5f;
2440                 }
// per-pixel step and start coordinate for this subspan, 12 fractional bits
2441                 substep[0] = (endtc[0] - tc[0]) * subscale;
2442                 substep[1] = (endtc[1] - tc[1]) * subscale;
2443                 subtc[0] = tc[0] * (1<<12);
2444                 subtc[1] = tc[1] * (1<<12);
2445                 if (filter)
2446                 {
2447                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2448                         {
2449                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2450                                 {
2451                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2452                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2453                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2454                                         tci[0] = subtc[0]>>12;
2455                                         tci[1] = subtc[1]>>12;
2456                                         tci1[0] = tci[0] + 1;
2457                                         tci1[1] = tci[1] + 1;
2458                                         tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2459                                         tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2460                                         tci1[0] = tci1[0] >= tcimin[0] ? (tci1[0] <= tcimax[0] ? tci1[0] : tcimax[0]) : tcimin[0];
2461                                         tci1[1] = tci1[1] >= tcimin[1] ? (tci1[1] <= tcimax[1] ? tci1[1] : tcimax[1]) : tcimin[1];
2462                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2463                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2464                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2465                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2466                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2467                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2468                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2469                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2470                                         out4f[x*4+0] = c[0];
2471                                         out4f[x*4+1] = c[1];
2472                                         out4f[x*4+2] = c[2];
2473                                         out4f[x*4+3] = c[3];
2474                                 }
2475                         }
2476                         else
2477                         {
2478                                 for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2479                                 {
2480                                         unsigned int frac[2] = { subtc[0]&0xFFF, subtc[1]&0xFFF };
2481                                         unsigned int ifrac[2] = { 0x1000 - frac[0], 0x1000 - frac[1] };
2482                                         unsigned int lerp[4] = { ifrac[0]*ifrac[1], frac[0]*ifrac[1], ifrac[0]*frac[1], frac[0]*frac[1] };
2483                                         tci[0] = subtc[0]>>12;
2484                                         tci[1] = subtc[1]>>12;
2485                                         tci1[0] = tci[0] + 1;
2486                                         tci1[1] = tci[1] + 1;
2487                                         tci[0] &= tciwrapmask[0];
2488                                         tci[1] &= tciwrapmask[1];
2489                                         tci1[0] &= tciwrapmask[0];
2490                                         tci1[1] &= tciwrapmask[1];
2491                                         pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2492                                         pixel[1] = pixelbase + 4 * (tci[1]*tciwidth+tci1[0]);
2493                                         pixel[2] = pixelbase + 4 * (tci1[1]*tciwidth+tci[0]);
2494                                         pixel[3] = pixelbase + 4 * (tci1[1]*tciwidth+tci1[0]);
2495                                         c[0] = (pixel[0][2]*lerp[0]+pixel[1][2]*lerp[1]+pixel[2][2]*lerp[2]+pixel[3][2]*lerp[3]) * (1.0f / 0xFF000000);
2496                                         c[1] = (pixel[0][1]*lerp[0]+pixel[1][1]*lerp[1]+pixel[2][1]*lerp[2]+pixel[3][1]*lerp[3]) * (1.0f / 0xFF000000);
2497                                         c[2] = (pixel[0][0]*lerp[0]+pixel[1][0]*lerp[1]+pixel[2][0]*lerp[2]+pixel[3][0]*lerp[3]) * (1.0f / 0xFF000000);
2498                                         c[3] = (pixel[0][3]*lerp[0]+pixel[1][3]*lerp[1]+pixel[2][3]*lerp[2]+pixel[3][3]*lerp[3]) * (1.0f / 0xFF000000);
2499                                         out4f[x*4+0] = c[0];
2500                                         out4f[x*4+1] = c[1];
2501                                         out4f[x*4+2] = c[2];
2502                                         out4f[x*4+3] = c[3];
2503                                 }
2504                         }
2505                 }
2506                 else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2507                 {
2508                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2509                         {
2510                                 tci[0] = subtc[0]>>12;
2511                                 tci[1] = subtc[1]>>12;
2512                                 tci[0] = tci[0] >= tcimin[0] ? (tci[0] <= tcimax[0] ? tci[0] : tcimax[0]) : tcimin[0];
2513                                 tci[1] = tci[1] >= tcimin[1] ? (tci[1] <= tcimax[1] ? tci[1] : tcimax[1]) : tcimin[1];
2514                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2515                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2516                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2517                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2518                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2519                                 out4f[x*4+0] = c[0];
2520                                 out4f[x*4+1] = c[1];
2521                                 out4f[x*4+2] = c[2];
2522                                 out4f[x*4+3] = c[3];
2523                         }
2524                 }
2525                 else
2526                 {
2527                         for (; x <= endsub; x++, subtc[0] += substep[0], subtc[1] += substep[1])
2528                         {
2529                                 tci[0] = subtc[0]>>12;
2530                                 tci[1] = subtc[1]>>12;
2531                                 tci[0] &= tciwrapmask[0];
2532                                 tci[1] &= tciwrapmask[1];
2533                                 pixel[0] = pixelbase + 4 * (tci[1]*tciwidth+tci[0]);
2534                                 c[0] = pixel[0][2] * (1.0f / 255.0f);
2535                                 c[1] = pixel[0][1] * (1.0f / 255.0f);
2536                                 c[2] = pixel[0][0] * (1.0f / 255.0f);
2537                                 c[3] = pixel[0][3] * (1.0f / 255.0f);
2538                                 out4f[x*4+0] = c[0];
2539                                 out4f[x*4+1] = c[1];
2540                                 out4f[x*4+2] = c[2];
2541                                 out4f[x*4+3] = c[3];
2542                         }
2543                 }
2544         }
2545 }
2546 #endif
2547
2548 static void DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2549 {
2550 #ifdef SSE_POSSIBLE
2551         int x;
2552         int startx = span->startx;
2553         int endx = span->endx;
2554         int flags;
2555         __m128 data, slope, tcscale;
2556         __m128i tcsize, tcmask, tcoffset, tcmax;
2557         __m128 tc, endtc;
2558         __m128i subtc, substep, endsubtc;
2559         int filter;
2560         int mip;
2561         int affine; // LordHavoc: optimized affine texturing case
2562         unsigned int * RESTRICT outi = (unsigned int *)out4ub;
2563         const unsigned char * RESTRICT pixelbase;
2564         DPSOFTRAST_Texture *texture = thread->texbound[texunitindex];
2565         // if no texture is bound, just fill it with white
2566         if (!texture)
2567         {
2568                 memset(out4ub + startx*4, 255, (span->endx - span->startx)*4);
2569                 return;
2570         }
2571         mip = triangle->mip[texunitindex];
2572         pixelbase = (const unsigned char *)texture->bytes + texture->mipmap[mip][0];
2573         // if this mipmap of the texture is 1 pixel, just fill it with that color
2574         if (texture->mipmap[mip][1] == 4)
2575         {
2576                 unsigned int k = *((const unsigned int *)pixelbase);
2577                 for (x = startx;x < endx;x++)
2578                         outi[x] = k;
2579                 return;
2580         }
2581         affine = zf[startx] == zf[endx-1];
2582         filter = texture->filter & DPSOFTRAST_TEXTURE_FILTER_LINEAR;
2583         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2584         flags = texture->flags;
2585         tcsize = _mm_shuffle_epi32(_mm_loadu_si128((const __m128i *)&texture->mipmap[mip][0]), _MM_SHUFFLE(3, 2, 3, 2));
2586         tcmask = _mm_sub_epi32(tcsize, _mm_set1_epi32(1));
2587         tcscale = _mm_cvtepi32_ps(tcsize);
2588         data = _mm_mul_ps(_mm_movelh_ps(data, data), tcscale);
2589         slope = _mm_mul_ps(_mm_movelh_ps(slope, slope), tcscale);
2590         endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2591         if (filter)
2592                 endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2593         endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2594         tcoffset = _mm_add_epi32(_mm_slli_epi32(_mm_shuffle_epi32(tcsize, _MM_SHUFFLE(0, 0, 0, 0)), 18), _mm_set1_epi32(4));
2595         tcmax = _mm_packs_epi32(tcmask, tcmask);
2596         for (x = startx;x < endx;)
2597         {
2598                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
2599                 __m128 subscale = _mm_set1_ps(65536.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
2600                 if (nextsub >= endx || affine)
2601                 {
2602                         nextsub = endsub = endx-1;
2603                         if (x < nextsub) subscale = _mm_set1_ps(65536.0f / (nextsub - x));
2604                 }       
2605                 tc = endtc;
2606                 subtc = endsubtc;
2607                 endtc = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
2608                 if (filter)
2609                         endtc = _mm_sub_ps(endtc, _mm_set1_ps(0.5f));
2610                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endtc, tc), subscale));
2611                 endsubtc = _mm_cvtps_epi32(_mm_mul_ps(endtc, _mm_set1_ps(65536.0f)));
2612                 subtc = _mm_unpacklo_epi64(subtc, _mm_add_epi32(subtc, substep));
2613                 substep = _mm_slli_epi32(substep, 1);
2614                 if (filter)
2615                 {
2616                         __m128i tcrange = _mm_srai_epi32(_mm_unpacklo_epi64(subtc, _mm_add_epi32(endsubtc, substep)), 16);
2617                         if (_mm_movemask_epi8(_mm_andnot_si128(_mm_cmplt_epi32(tcrange, _mm_setzero_si128()), _mm_cmplt_epi32(tcrange, tcmask))) == 0xFFFF)
2618                         {
2619                                 int stride = _mm_cvtsi128_si32(tcoffset)>>16;
2620                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2621                                 {
2622                                         const unsigned char * RESTRICT ptr1, * RESTRICT ptr2;                   
2623                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, pix3, pix4, fracm;
2624                                         tci = _mm_madd_epi16(tci, tcoffset);
2625                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2626                                         ptr2 = pixelbase + _mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)));
2627                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2628                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2629                                         pix3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr2), _mm_setzero_si128());
2630                                         pix4 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr2 + stride)), _mm_setzero_si128());
2631                                         fracm = _mm_srli_epi16(subtc, 1);
2632                                         pix1 = _mm_add_epi16(pix1,
2633                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2634                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2635                                         pix3 = _mm_add_epi16(pix3,
2636                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2637                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2638                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2639                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2640                                         pix2 = _mm_add_epi16(pix2,
2641                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2642                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2643                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2644                                 }
2645                                 if (x <= endsub)
2646                                 {
2647                                         const unsigned char * RESTRICT ptr1;
2648                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), pix1, pix2, fracm;
2649                                         tci = _mm_madd_epi16(tci, tcoffset);
2650                                         ptr1 = pixelbase + _mm_cvtsi128_si32(tci);
2651                                         pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)ptr1), _mm_setzero_si128());
2652                                         pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(ptr1 + stride)), _mm_setzero_si128());
2653                                         fracm = _mm_srli_epi16(subtc, 1);
2654                                         pix1 = _mm_add_epi16(pix1,
2655                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2656                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2657                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2658                                         pix1 = _mm_add_epi16(pix1,
2659                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2660                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2661                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2662                                         x++;
2663                                 }
2664                         }
2665                         else if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2666                         {
2667                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2668                                 {
2669                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2670                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2671                                         tci = _mm_madd_epi16(tci, tcoffset);
2672                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2673                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2674                                                                                         _mm_setzero_si128());
2675                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2676                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2677                                                                                         _mm_setzero_si128());
2678                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2679                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2680                                         tci = _mm_madd_epi16(tci, tcoffset);
2681                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2682                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2683                                                                                         _mm_setzero_si128());
2684                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2685                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2686                                                                                         _mm_setzero_si128());
2687                                         fracm = _mm_srli_epi16(subtc, 1);
2688                                         pix1 = _mm_add_epi16(pix1,
2689                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2690                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2691                                         pix3 = _mm_add_epi16(pix3,
2692                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2693                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2694                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2695                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2696                                         pix2 = _mm_add_epi16(pix2,
2697                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2698                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2699                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2700                                 }
2701                                 if (x <= endsub)
2702                                 {
2703                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2704                                         tci = _mm_min_epi16(_mm_max_epi16(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), _mm_setzero_si128()), tcmax);
2705                                         tci = _mm_madd_epi16(tci, tcoffset);
2706                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]), 
2707                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])), 
2708                                                                                         _mm_setzero_si128());
2709                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]), 
2710                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])), 
2711                                                                                         _mm_setzero_si128());
2712                                         fracm = _mm_srli_epi16(subtc, 1);
2713                                         pix1 = _mm_add_epi16(pix1,
2714                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2715                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2716                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2717                                         pix1 = _mm_add_epi16(pix1,
2718                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2719                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2720                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2721                                         x++;
2722                                 }
2723                         }
2724                         else
2725                         {
2726                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2727                                 {
2728                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, pix3, pix4, fracm;
2729                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2730                                         tci = _mm_madd_epi16(tci, tcoffset);
2731                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2732                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2733                                                                                         _mm_setzero_si128());
2734                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2735                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2736                                                                                         _mm_setzero_si128());
2737                                         tci = _mm_shuffle_epi32(_mm_shufflehi_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 2, 3, 2));
2738                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2739                                         tci = _mm_madd_epi16(tci, tcoffset);
2740                                         pix3 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),
2741                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2742                                                                                         _mm_setzero_si128());
2743                                         pix4 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2744                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2745                                                                                         _mm_setzero_si128());
2746                                         fracm = _mm_srli_epi16(subtc, 1);
2747                                         pix1 = _mm_add_epi16(pix1,
2748                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2749                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2750                                         pix3 = _mm_add_epi16(pix3,
2751                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix3), 1),
2752                                                                                                                  _mm_shuffle_epi32(_mm_shufflehi_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(3, 2, 3, 2))));
2753                                         pix2 = _mm_unpacklo_epi64(pix1, pix3);
2754                                         pix4 = _mm_unpackhi_epi64(pix1, pix3);
2755                                         pix2 = _mm_add_epi16(pix2,
2756                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix4, pix2), 1),
2757                                                                                                                  _mm_shufflehi_epi16(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0)), _MM_SHUFFLE(0, 0, 0, 0))));
2758                                         _mm_storel_epi64((__m128i *)&outi[x], _mm_packus_epi16(pix2, _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 2, 3, 2))));
2759                                 }
2760                                 if (x <= endsub)
2761                                 {
2762                                         __m128i tci = _mm_shuffle_epi32(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(1, 0, 1, 0)), pix1, pix2, fracm;
2763                                         tci = _mm_and_si128(_mm_add_epi16(tci, _mm_setr_epi32(0, 1, 0x10000, 0x10001)), tcmax);
2764                                         tci = _mm_madd_epi16(tci, tcoffset);
2765                                         pix1 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(tci)]),                                                                                        
2766                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(1, 1, 1, 1)))])),
2767                                                                                         _mm_setzero_si128());
2768                                         pix2 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(_mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))]),
2769                                                                                                                                 _mm_cvtsi32_si128(*(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(3, 3, 3, 3)))])),
2770                                                                                         _mm_setzero_si128());
2771                                         fracm = _mm_srli_epi16(subtc, 1);
2772                                         pix1 = _mm_add_epi16(pix1,
2773                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2774                                                                                                                  _mm_shuffle_epi32(_mm_shufflelo_epi16(fracm, _MM_SHUFFLE(2, 2, 2, 2)), _MM_SHUFFLE(1, 0, 1, 0))));
2775                                         pix2 = _mm_shuffle_epi32(pix1, _MM_SHUFFLE(3, 2, 3, 2));
2776                                         pix1 = _mm_add_epi16(pix1,
2777                                                                                  _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 1),
2778                                                                                                                  _mm_shufflelo_epi16(fracm, _MM_SHUFFLE(0, 0, 0, 0))));
2779                                         outi[x] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
2780                                         x++;
2781                                 }
2782                         }
2783                 }
2784                 else
2785                 {
2786                         if (flags & DPSOFTRAST_TEXTURE_FLAG_CLAMPTOEDGE)
2787                         {
2788                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2789                                 {
2790                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2791                                         tci = _mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax); 
2792                                         tci = _mm_madd_epi16(tci, tcoffset);
2793                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2794                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2795                                 }
2796                                 if (x <= endsub)
2797                                 {
2798                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2799                                         tci =_mm_min_epi16(_mm_max_epi16(tci, _mm_setzero_si128()), tcmax);
2800                                         tci = _mm_madd_epi16(tci, tcoffset);
2801                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2802                                         x++;
2803                                 }
2804                         }
2805                         else
2806                         {
2807                                 for (; x + 1 <= endsub; x += 2, subtc = _mm_add_epi32(subtc, substep))
2808                                 {
2809                                         __m128i tci = _mm_shufflehi_epi16(_mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1)), _MM_SHUFFLE(3, 1, 3, 1));
2810                                         tci = _mm_and_si128(tci, tcmax); 
2811                                         tci = _mm_madd_epi16(tci, tcoffset);
2812                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2813                                         outi[x+1] = *(const int *)&pixelbase[_mm_cvtsi128_si32(_mm_shuffle_epi32(tci, _MM_SHUFFLE(2, 2, 2, 2)))];
2814                                 }
2815                                 if (x <= endsub)
2816                                 {
2817                                         __m128i tci = _mm_shufflelo_epi16(subtc, _MM_SHUFFLE(3, 1, 3, 1));
2818                                         tci = _mm_and_si128(tci, tcmax); 
2819                                         tci = _mm_madd_epi16(tci, tcoffset);
2820                                         outi[x] = *(const int *)&pixelbase[_mm_cvtsi128_si32(tci)];
2821                                         x++;
2822                                 }
2823                         }
2824                 }
2825         }
2826 #endif
2827 }
2828
2829 static void DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char * RESTRICT out4ub, int texunitindex, int arrayindex, const float * RESTRICT zf)
2830 {
2831         // TODO: IMPLEMENT
2832         memset(out4ub + span->startx*4, 255, (span->startx - span->endx)*4);
2833 }
2834
// Shadow map lookup stub: the lookup vector is ignored and 1.0f is always returned.
static float DPSOFTRAST_SampleShadowmap(const float *vector)
{
	// TODO: IMPLEMENT
	const float result = 1.0f;
	(void)vector;
	return result;
}
2840
#if 0
// (disabled) Multiplies a float RGBA span by an interpolated vertex attribute:
// for each pixel x in [span->startx, span->endx) and each of the 4 channels,
//   out4f = in4f * ((data + slope*x) * zf[x])
// where data/slope come from DPSOFTRAST_CALCATTRIB4F for attribute array arrayindex.
static void DPSOFTRAST_Draw_Span_MultiplyVarying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, int arrayindex, const float *zf)
{
	float data[4];
	float slope[4];
	float c[4];
	float z;
	int x, channel;
	const int first = span->startx;
	const int last = span->endx;
	DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
	for (x = first;x < last;x++)
	{
		float *dst = out4f + x*4;
		const float *src = in4f + x*4;
		z = zf[x];
		// evaluate the whole attribute first, then modulate the input with it
		for (channel = 0;channel < 4;channel++)
			c[channel] = (data[channel] + slope[channel]*x) * z;
		for (channel = 0;channel < 4;channel++)
			dst[channel] = src[channel] * c[channel];
	}
}
#endif
2866
#if 0
// (disabled) Writes an interpolated vertex attribute into a float RGBA span:
// for each pixel x in [span->startx, span->endx) and each of the 4 channels,
//   out4f = (data + slope*x) * zf[x]
// where data/slope come from DPSOFTRAST_CALCATTRIB4F for attribute array arrayindex.
static void DPSOFTRAST_Draw_Span_Varying(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, int arrayindex, const float *zf)
{
	float data[4];
	float slope[4];
	float c[4];
	float z;
	int x, channel;
	const int first = span->startx;
	const int last = span->endx;
	DPSOFTRAST_CALCATTRIB4F(triangle, span, data, slope, arrayindex);
	for (x = first;x < last;x++)
	{
		float *dst = out4f + x*4;
		z = zf[x];
		// all four channels are computed before any of them is stored
		for (channel = 0;channel < 4;channel++)
			c[channel] = (data[channel] + slope[channel]*x) * z;
		for (channel = 0;channel < 4;channel++)
			dst[channel] = c[channel];
	}
}
#endif
2892
#if 0
// (disabled) Adds the part of inb4f that exceeds subcolor onto ina4f:
// for each pixel in [span->startx, span->endx) and each of the 4 channels,
//   out4f = ina4f + max(inb4f - subcolor, 0)
// triangle is not used.
static void DPSOFTRAST_Draw_Span_AddBloom(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f, const float *subcolor)
{
	int x, channel;
	const int first = span->startx;
	const int last = span->endx;
	float excess[4], threshold[4];
	// work from a private copy of the subtracted color
	for (channel = 0;channel < 4;channel++)
		threshold[channel] = subcolor[channel];
	for (x = first;x < last;x++)
	{
		const float *base = ina4f + x*4;
		const float *glow = inb4f + x*4;
		float *dst = out4f + x*4;
		for (channel = 0;channel < 4;channel++)
		{
			excess[channel] = glow[channel] - threshold[channel];
			if (excess[channel] < 0.0f)
				excess[channel] = 0.0f;
		}
		for (channel = 0;channel < 4;channel++)
			dst[channel] = base[channel] + excess[channel];
	}
}
#endif
2915
#if 0
// (disabled) Component-wise product of two float RGBA spans:
// out4f = ina4f * inb4f for every float of pixels [span->startx, span->endx).
// triangle is not used.
static void DPSOFTRAST_Draw_Span_MultiplyBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
{
	int x, channel;
	const int first = span->startx;
	const int last = span->endx;
	for (x = first;x < last;x++)
	{
		const int base = x*4;
		for (channel = 0;channel < 4;channel++)
			out4f[base+channel] = ina4f[base+channel] * inb4f[base+channel];
	}
}
#endif
2929
#if 0
// (disabled) Component-wise sum of two float RGBA spans:
// out4f = ina4f + inb4f for every float of pixels [span->startx, span->endx).
// triangle is not used.
static void DPSOFTRAST_Draw_Span_AddBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
{
	int x, channel;
	const int first = span->startx;
	const int last = span->endx;
	for (x = first;x < last;x++)
	{
		const int base = x*4;
		for (channel = 0;channel < 4;channel++)
			out4f[base+channel] = ina4f[base+channel] + inb4f[base+channel];
	}
}
#endif
2943
#if 0
// (disabled) Blends inb4f over ina4f using inb4f's fourth component as the weight:
// for each pixel in [span->startx, span->endx) and each of the 4 channels,
//   out4f = ina4f * (1 - inb4f[3]) + inb4f * inb4f[3]
// triangle is not used.
static void DPSOFTRAST_Draw_Span_MixBuffers(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *ina4f, const float *inb4f)
{
	int x, channel;
	const int first = span->startx;
	const int last = span->endx;
	float keep, weight;
	for (x = first;x < last;x++)
	{
		const int base = x*4;
		// both weights are fetched before any output of this pixel is written
		keep = 1.0f - inb4f[base+3];
		weight = inb4f[base+3];
		for (channel = 0;channel < 4;channel++)
			out4f[base+channel] = ina4f[base+channel] * keep + inb4f[base+channel] * weight;
	}
}
#endif
2960
#if 0
// (disabled) Blends a constant color over a float RGBA span using color[3] as the weight:
// for each pixel in [span->startx, span->endx) and each of the 4 channels,
//   out4f = in4f * (1 - color[3]) + color * color[3]
// triangle is not used.
static void DPSOFTRAST_Draw_Span_MixUniformColor(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, float *out4f, const float *in4f, const float *color)
{
	int x, channel;
	const int first = span->startx;
	const int last = span->endx;
	float tint[4], keep, weight;
	// work from a private copy of the color
	for (channel = 0;channel < 4;channel++)
		tint[channel] = color[channel];
	keep = 1.0f - tint[3];
	weight = tint[3];
	for (x = first;x < last;x++)
	{
		const int base = x*4;
		for (channel = 0;channel < 4;channel++)
			out4f[base+channel] = in4f[base+channel] * keep + tint[channel] * weight;
	}
}
#endif
2981
2982
2983
2984 static void DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, int arrayindex, const float *zf)
2985 {
2986 #ifdef SSE_POSSIBLE
2987         int x;
2988         int startx = span->startx;
2989         int endx = span->endx;
2990         __m128 data, slope;
2991         __m128 mod, endmod;
2992         __m128i submod, substep, endsubmod;
2993         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
2994         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
2995         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
2996         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
2997         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
2998         for (x = startx; x < endx;)
2999         {
3000                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3001                 __m128 subscale = _mm_set1_ps(256.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3002                 if (nextsub >= endx)
3003                 {
3004                         nextsub = endsub = endx-1;
3005                         if (x < nextsub) subscale = _mm_set1_ps(256.0f / (nextsub - x));
3006                 }
3007                 mod = endmod;
3008                 submod = endsubmod;
3009                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3010                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3011                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(256.0f)));
3012                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3013                 substep = _mm_packs_epi32(substep, substep);
3014                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3015                 {
3016                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&in4ub[x*4]));
3017                         pix = _mm_mulhi_epu16(pix, submod);
3018                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3019                 }
3020                 if (x <= endsub)
3021                 {
3022                         __m128i pix = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&in4ub[x*4]));
3023                         pix = _mm_mulhi_epu16(pix, submod);
3024                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3025                         x++;
3026                 }
3027         }
3028 #endif
3029 }
3030
3031 static void DPSOFTRAST_Draw_Span_VaryingBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, int arrayindex, const float *zf)
3032 {
3033 #ifdef SSE_POSSIBLE
3034         int x;
3035         int startx = span->startx;
3036         int endx = span->endx;
3037         __m128 data, slope;
3038         __m128 mod, endmod;
3039         __m128i submod, substep, endsubmod;
3040         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3041         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3042         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3043         endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx))), _mm_load1_ps(&zf[startx]));
3044         endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3045         for (x = startx; x < endx;)
3046         {
3047                 int nextsub = x + DPSOFTRAST_DRAW_MAXSUBSPAN, endsub = nextsub - 1;
3048                 __m128 subscale = _mm_set1_ps(4095.0f/DPSOFTRAST_DRAW_MAXSUBSPAN);
3049                 if (nextsub >= endx)
3050                 {
3051                         nextsub = endsub = endx-1;
3052                         if (x < nextsub) subscale = _mm_set1_ps(4095.0f / (nextsub - x));
3053                 }
3054                 mod = endmod;
3055                 submod = endsubmod;
3056                 endmod = _mm_mul_ps(_mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(nextsub))), _mm_load1_ps(&zf[nextsub]));
3057                 substep = _mm_cvtps_epi32(_mm_mul_ps(_mm_sub_ps(endmod, mod), subscale));
3058                 endsubmod = _mm_cvtps_epi32(_mm_mul_ps(endmod, _mm_set1_ps(4095.0f)));
3059                 submod = _mm_packs_epi32(submod, _mm_add_epi32(submod, substep));
3060                 substep = _mm_packs_epi32(substep, substep);
3061                 for (; x + 1 <= endsub; x += 2, submod = _mm_add_epi16(submod, substep))
3062                 {
3063                         __m128i pix = _mm_srai_epi16(submod, 4);
3064                         _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3065                 }
3066                 if (x <= endsub)
3067                 {
3068                         __m128i pix = _mm_srai_epi16(submod, 4);
3069                         *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3070                         x++;
3071                 }
3072         }
3073 #endif
3074 }
3075
3076 static void DPSOFTRAST_Draw_Span_AddBloomBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *subcolor)
3077 {
3078 #ifdef SSE_POSSIBLE
3079         int x, startx = span->startx, endx = span->endx;
3080         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(subcolor), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3081         localcolor = _mm_packs_epi32(localcolor, localcolor);
3082         for (x = startx;x+2 <= endx;x+=2)
3083         {
3084                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3085                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3086                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3087                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3088         }
3089         if (x < endx)
3090         {
3091                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3092                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3093                 pix1 = _mm_add_epi16(pix1, _mm_subs_epu16(pix2, localcolor));
3094                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3095         }
3096 #endif
3097 }
3098
3099 static void DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3100 {
3101 #ifdef SSE_POSSIBLE
3102         int x, startx = span->startx, endx = span->endx;
3103         for (x = startx;x+2 <= endx;x+=2)
3104         {
3105                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3106                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
3107                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3108                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3109         }
3110         if (x < endx)
3111         {
3112                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3113                 __m128i pix2 = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
3114                 pix1 = _mm_mulhi_epu16(pix1, pix2);
3115                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3116         }
3117 #endif
3118 }
3119
3120 static void DPSOFTRAST_Draw_Span_AddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3121 {
3122 #ifdef SSE_POSSIBLE
3123         int x, startx = span->startx, endx = span->endx;
3124         for (x = startx;x+2 <= endx;x+=2)
3125         {
3126                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3127                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3128                 pix1 = _mm_add_epi16(pix1, pix2);
3129                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3130         }
3131         if (x < endx)
3132         {
3133                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3134                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3135                 pix1 = _mm_add_epi16(pix1, pix2);
3136                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3137         }
3138 #endif
3139 }
3140
#if 0
// disabled: per channel out = min(ina + ((inb * tint*256) >> 8), 255) for the
// BGRA8 pixels in [span->startx, span->endx)
// inbtintbgra is four floats used in the order given (no lane swap is done
// here); triangle is not used
static void DPSOFTRAST_Draw_Span_TintedAddBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub, const float *inbtintbgra)
{
#ifdef SSE_POSSIBLE
	const __m128i zero = _mm_setzero_si128();
	const int endx = span->endx;
	int x = span->startx;
	__m128i base, added, scale;
	// tint as 16 bit integers scaled by 256, duplicated for two pixels
	scale = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(inbtintbgra), _mm_set1_ps(256.0f)));
	scale = _mm_packs_epi32(scale, scale);
	// two pixels per iteration
	while (x + 2 <= endx)
	{
		base = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), zero);
		// inb bytes sit in the high byte so mulhi gives (tint * b) >> 8
		added = _mm_unpacklo_epi8(zero, _mm_loadl_epi64((const __m128i *)&inb4ub[x*4]));
		base = _mm_add_epi16(base, _mm_mulhi_epu16(scale, added));
		_mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(base, base));
		x += 2;
	}
	// odd trailing pixel
	if (x < endx)
	{
		base = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), zero);
		added = _mm_unpacklo_epi8(zero, _mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]));
		base = _mm_add_epi16(base, _mm_mulhi_epu16(scale, added));
		*(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(base, base));
	}
#endif
}
#endif
3165
3166 static void DPSOFTRAST_Draw_Span_MixBuffersBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *ina4ub, const unsigned char *inb4ub)
3167 {
3168 #ifdef SSE_POSSIBLE
3169         int x, startx = span->startx, endx = span->endx;
3170         for (x = startx;x+2 <= endx;x+=2)
3171         {
3172                 __m128i pix1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&ina4ub[x*4]), _mm_setzero_si128());
3173                 __m128i pix2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&inb4ub[x*4]), _mm_setzero_si128());
3174                 __m128i blend = _mm_shufflehi_epi16(_mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3));
3175                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3176                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix1, pix1));
3177         }
3178         if (x < endx)
3179         {
3180                 __m128i pix1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&ina4ub[x*4]), _mm_setzero_si128());
3181                 __m128i pix2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&inb4ub[x*4]), _mm_setzero_si128());
3182                 __m128i blend = _mm_shufflelo_epi16(pix2, _MM_SHUFFLE(3, 3, 3, 3));
3183                 pix1 = _mm_add_epi16(pix1, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(pix2, pix1), 4), _mm_slli_epi16(blend, 4)));
3184                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix1, pix1));
3185         }
3186 #endif
3187 }
3188
3189 static void DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span, unsigned char *out4ub, const unsigned char *in4ub, const float *color)
3190 {
3191 #ifdef SSE_POSSIBLE
3192         int x, startx = span->startx, endx = span->endx;
3193         __m128i localcolor = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(color), _mm_set1_ps(255.0f))), _MM_SHUFFLE(3, 0, 1, 2)), blend;
3194         localcolor = _mm_packs_epi32(localcolor, localcolor);
3195         blend = _mm_slli_epi16(_mm_shufflehi_epi16(_mm_shufflelo_epi16(localcolor, _MM_SHUFFLE(3, 3, 3, 3)), _MM_SHUFFLE(3, 3, 3, 3)), 4);
3196         for (x = startx;x+2 <= endx;x+=2)
3197         {
3198                 __m128i pix = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)&in4ub[x*4]), _mm_setzero_si128());
3199                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3200                 _mm_storel_epi64((__m128i *)&out4ub[x*4], _mm_packus_epi16(pix, pix));
3201         }
3202         if (x < endx)
3203         {
3204                 __m128i pix = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)&in4ub[x*4]), _mm_setzero_si128());
3205                 pix = _mm_add_epi16(pix, _mm_mulhi_epi16(_mm_slli_epi16(_mm_sub_epi16(localcolor, pix), 4), blend));
3206                 *(int *)&out4ub[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3207         }
3208 #endif
3209 }
3210
3211
3212
3213 static void DPSOFTRAST_VertexShader_Generic(void)
3214 {
3215         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3216         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3217         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3218         if (dpsoftrast.shader_permutation & SHADERPERMUTATION_SPECULAR)
3219                 DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3220 }
3221
3222 static void DPSOFTRAST_PixelShader_Generic(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3223 {
3224         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3225         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3226         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3227         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3228         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3229         if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3230         {
3231                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_FIRST, 2, buffer_z);
3232                 DPSOFTRAST_Draw_Span_MultiplyVaryingBGRA8(triangle, span, buffer_FragColorbgra8, buffer_texture_colorbgra8, 1, buffer_z);
3233                 if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3234                 {
3235                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_SECOND, 2, buffer_z);
3236                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3237                         {
3238                                 // multiply
3239                                 DPSOFTRAST_Draw_Span_MultiplyBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3240                         }
3241                         else if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3242                         {
3243                                 // add
3244                                 DPSOFTRAST_Draw_Span_AddBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3245                         }
3246                         else if (thread->shader_permutation & SHADERPERMUTATION_VERTEXTEXTUREBLEND)
3247                         {
3248                                 // alphablend
3249                                 DPSOFTRAST_Draw_Span_MixBuffersBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_lightmapbgra8);
3250                         }
3251                 }
3252         }
3253         else
3254                 DPSOFTRAST_Draw_Span_VaryingBGRA8(triangle, span, buffer_FragColorbgra8, 1, buffer_z);
3255         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3256 }
3257
3258
3259
3260 static void DPSOFTRAST_VertexShader_PostProcess(void)
3261 {
3262         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3263         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0);
3264         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD4);
3265 }
3266
3267 static void DPSOFTRAST_PixelShader_PostProcess(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3268 {
3269         // TODO: optimize!!  at the very least there is no reason to use texture sampling on the frame texture
3270         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3271         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3272         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3273         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3274         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_FragColorbgra8, GL20TU_FIRST, 2, buffer_z);
3275         if (thread->shader_permutation & SHADERPERMUTATION_BLOOM)
3276         {
3277                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_SECOND, 3, buffer_z);
3278                 DPSOFTRAST_Draw_Span_AddBloomBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, buffer_texture_colorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_BloomColorSubtract * 4);
3279         }
3280         DPSOFTRAST_Draw_Span_MixUniformColorBGRA8(triangle, span, buffer_FragColorbgra8, buffer_FragColorbgra8, thread->uniform4f + DPSOFTRAST_UNIFORM_ViewTintColor * 4);
3281         if (thread->shader_permutation & SHADERPERMUTATION_SATURATION)
3282         {
3283                 // TODO: implement saturation
3284         }
3285         if (thread->shader_permutation & SHADERPERMUTATION_GAMMARAMPS)
3286         {
3287                 // TODO: implement gammaramps
3288         }
3289         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3290 }
3291
3292
3293
3294 static void DPSOFTRAST_VertexShader_Depth_Or_Shadow(void)
3295 {
3296         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3297 }
3298
3299 static void DPSOFTRAST_PixelShader_Depth_Or_Shadow(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3300 {
3301         // this is never called (because colormask is off when this shader is used)
3302         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3303         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3304         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3305         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
3306         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3307 }
3308
3309
3310
3311 static void DPSOFTRAST_VertexShader_FlatColor(void)
3312 {
3313         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3314         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3315 }
3316
3317 static void DPSOFTRAST_PixelShader_FlatColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3318 {
3319 #ifdef SSE_POSSIBLE
3320         unsigned char * RESTRICT pixelmask = span->pixelmask;
3321         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3322         int x, startx = span->startx, endx = span->endx;
3323         __m128i Color_Ambientm;
3324         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3325         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3326         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3327         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3328         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3329         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3330                 pixel = buffer_FragColorbgra8;
3331         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3332         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3333         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3334         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3335         for (x = startx;x < endx;x++)
3336         {
3337                 __m128i color, pix;
3338                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3339                 {
3340                         __m128i pix2;
3341                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3342                         pix = _mm_mulhi_epu16(Color_Ambientm, _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3343                         pix2 = _mm_mulhi_epu16(Color_Ambientm, _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3344                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3345                         x += 3;
3346                         continue;
3347                 }
3348                 if (!pixelmask[x])
3349                         continue;
3350                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3351                 pix = _mm_mulhi_epu16(Color_Ambientm, color);
3352                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3353         }
3354         if (pixel == buffer_FragColorbgra8)
3355                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3356 #endif
3357 }
3358
3359
3360
3361 static void DPSOFTRAST_VertexShader_VertexColor(void)
3362 {
3363         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3364         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_COLOR);
3365         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3366 }
3367
3368 static void DPSOFTRAST_PixelShader_VertexColor(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3369 {
3370 #ifdef SSE_POSSIBLE
3371         unsigned char * RESTRICT pixelmask = span->pixelmask;
3372         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3373         int x, startx = span->startx, endx = span->endx;
3374         __m128i Color_Ambientm, Color_Diffusem;
3375         __m128 data, slope;
3376         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3377         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3378         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3379         int arrayindex = DPSOFTRAST_ARRAY_COLOR;
3380         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3381         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, 2, buffer_z);
3382         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3383                 pixel = buffer_FragColorbgra8;
3384         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3385         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3386         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
3387         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
3388         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(4096.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3389         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3390         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3391         DPSOFTRAST_CALCATTRIB(triangle, span, data, slope, arrayindex);
3392         data = _mm_shuffle_ps(data, data, _MM_SHUFFLE(3, 0, 1, 2));
3393         slope = _mm_shuffle_ps(slope, slope, _MM_SHUFFLE(3, 0, 1, 2));
3394         data = _mm_add_ps(data, _mm_mul_ps(slope, _mm_set1_ps(startx)));
3395         data = _mm_mul_ps(data, _mm_set1_ps(4096.0f));
3396         slope = _mm_mul_ps(slope, _mm_set1_ps(4096.0f));
3397         for (x = startx;x < endx;x++, data = _mm_add_ps(data, slope))
3398         {
3399                 __m128i color, mod, pix;
3400                 if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3401                 {
3402                         __m128i pix2, mod2;
3403                         __m128 z = _mm_loadu_ps(&buffer_z[x]);
3404                         color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3405                         mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(0, 0, 0, 0))));
3406                         data = _mm_add_ps(data, slope);
3407                         mod = _mm_packs_epi32(mod, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(1, 1, 1, 1)))));
3408                         data = _mm_add_ps(data, slope);
3409                         mod2 = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(2, 2, 2, 2))));
3410                         data = _mm_add_ps(data, slope);
3411                         mod2 = _mm_packs_epi32(mod2, _mm_cvtps_epi32(_mm_mul_ps(data, _mm_shuffle_ps(z, z, _MM_SHUFFLE(3, 3, 3, 3)))));
3412                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod), Color_Ambientm),
3413                                                                   _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3414                         pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, mod2), Color_Ambientm),
3415                                                                    _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3416                         _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3417                         x += 3;
3418                         continue;
3419                 }
3420                 if (!pixelmask[x])
3421                         continue;
3422                 color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3423                 mod = _mm_cvtps_epi32(_mm_mul_ps(data, _mm_load1_ps(&buffer_z[x]))); 
3424                 mod = _mm_packs_epi32(mod, mod);
3425                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(mod, Color_Diffusem), Color_Ambientm), color);
3426                 *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3427         }
3428         if (pixel == buffer_FragColorbgra8)
3429                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3430 #endif
3431 }
3432
3433
3434
3435 static void DPSOFTRAST_VertexShader_Lightmap(void)
3436 {
3437         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3438         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3439         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3440 }
3441
// Pixel shader for the lightmap path. The whole body is compiled out when
// SSE_POSSIBLE is not defined, leaving an empty function.
//
// For every pixel of the span whose pixelmask byte is nonzero it computes, per
// channel and in 16-bit fixed point (color uniforms scaled by 256):
//     out = color * (lightmap * Color_Diffuse + Color_Ambient) [+ glow * Color_Glow]
// The glow term is only added when SHADERPERMUTATION_GLOW is set. Lane 3 of the
// diffuse and glow constants is zeroed and lane 3 of the ambient constant is
// replaced by the Alpha uniform, so the alpha output is color alpha * Alpha.
//
// Results are written straight into color buffer 0, unless alpha-kill or a
// non-opaque blend mode is active; then they are staged in
// buffer_FragColorbgra8 and handed to DPSOFTRAST_Draw_Span_FinishBGRA8.
3442 static void DPSOFTRAST_PixelShader_Lightmap(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3443 {
3444 #ifdef SSE_POSSIBLE
             // one byte per span pixel; zero means the pixel is skipped
3445         unsigned char * RESTRICT pixelmask = span->pixelmask;
             // default destination: this span's row in color buffer 0, 4 bytes per pixel
3446         unsigned char * RESTRICT pixel = (unsigned char *)dpsoftrast.fb_colorpixels[0] + (span->y * dpsoftrast.fb_width + span->x) * 4;
3447         int x, startx = span->startx, endx = span->endx;
3448         __m128i Color_Ambientm, Color_Diffusem, Color_Glowm, Color_AmbientGlowm;
3449         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3450         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3451         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3452         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3453         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
             // presumably fills buffer_z with the per-pixel values the texture fetches below need - confirm against DPSOFTRAST_Draw_Span_Begin
3454         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
             // color texture is addressed by TEXCOORD0, lightmap by TEXCOORD4
3455         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3456         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
             // alpha-kill and blending are not done here: render into the scratch buffer and let FinishBGRA8 (end of function) deal with it
3457         if ((thread->shader_permutation & SHADERPERMUTATION_ALPHAKILL) || thread->fb_blendmode != DPSOFTRAST_BLENDMODE_OPAQUE)
3458                 pixel = buffer_FragColorbgra8;
             // ambient color: scale by 256, convert to int32, swap lanes 0 and 2 (matching the bgra8 byte order of the texture buffers)
3459         Color_Ambientm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
             // clear lane 3 and substitute the Alpha uniform; note this lane is scaled by 255 and truncated by a C cast, unlike the color lanes
3460         Color_Ambientm = _mm_and_si128(Color_Ambientm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3461         Color_Ambientm = _mm_or_si128(Color_Ambientm, _mm_setr_epi32(0, 0, 0, (int)(thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0]*255.0f)));
             // narrow to 16-bit lanes; packing the register with itself duplicates the four values into both 64-bit halves (one copy per pixel of a pixel pair)
3462         Color_Ambientm = _mm_packs_epi32(Color_Ambientm, Color_Ambientm);
             // diffuse color: same conversion, lane 3 forced to zero so the lightmap does not contribute to alpha
3463         Color_Diffusem = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3464         Color_Diffusem = _mm_and_si128(Color_Diffusem, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3465         Color_Diffusem = _mm_packs_epi32(Color_Diffusem, Color_Diffusem);
3466         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3467         {
                     // glow texture shares TEXCOORD0 with the color texture
3468                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
                     // glow color: same conversion as diffuse, lane 3 zeroed
3469                 Color_Glowm = _mm_shuffle_epi32(_mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(&thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4]), _mm_set1_ps(256.0f))), _MM_SHUFFLE(3, 0, 1, 2));
3470                 Color_Glowm = _mm_and_si128(Color_Glowm, _mm_setr_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0));
3471                 Color_Glowm = _mm_packs_epi32(Color_Glowm, Color_Glowm);
                     // low 64 bits = ambient, high 64 bits = glow color; used by the single-pixel path to do both multiplies in one instruction
3472                 Color_AmbientGlowm = _mm_unpacklo_epi64(Color_Ambientm, Color_Glowm);
3473                 for (x = startx;x < endx;x++)
3474                 {
3475                         __m128i color, lightmap, glow, pix;
                             // fast path: at least four pixels remain and their four mask bytes are all exactly 1
                             // NOTE(review): the mask bytes are read through an unsigned int pointer (type-punned, alignment not guaranteed by this code)
3476                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3477                         {
3478                                 __m128i pix2;
3479                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3480                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3481                                 glow = _mm_loadu_si128((const __m128i *)&buffer_texture_glowbgra8[x*4]);
                                     // unpacking with zero as the low byte places each texel byte in the high byte of a 16-bit lane (value << 8),
                                     // so each _mm_mulhi_epu16 acts as a fixed-point multiply; pix covers pixels x and x+1
3482                                 pix = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3483                                                                                                         _mm_unpacklo_epi8(_mm_setzero_si128(), color)),
3484                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpacklo_epi8(_mm_setzero_si128(), glow)));
                                     // pix2 is the same computation on the upper eight bytes: pixels x+2 and x+3
3485                                 pix2 = _mm_add_epi16(_mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3486                                                                                                         _mm_unpackhi_epi8(_mm_setzero_si128(), color)),
3487                                                                         _mm_mulhi_epu16(Color_Glowm, _mm_unpackhi_epi8(_mm_setzero_si128(), glow)));
                                     // saturate back to bytes and store all four pixels at once
3488                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
                                     // the loop's own x++ supplies the fourth step
3489                                 x += 3;
3490                                 continue;
3491                         }
3492                         if (!pixelmask[x])
3493                                 continue;
                             // single-pixel path: each load fills only the low 64 bits, the upper lanes are zero
3494                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3495                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3496                         glow = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_glowbgra8[x*4]));
                             // low half: (lightmap*diffuse + ambient) * color; high half: (0 + glow color) * glow texel
3497                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, lightmap), Color_AmbientGlowm), _mm_unpacklo_epi64(color, glow));
                             // fold the glow term (high half) onto the lit color (low half)
3498                         pix = _mm_add_epi16(pix, _mm_shuffle_epi32(pix, _MM_SHUFFLE(3, 2, 3, 2)));
3499                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3500                 }
3501         }
3502         else
3503         {
                     // same loop as above without the glow term
3504                 for (x = startx;x < endx;x++)
3505                 {
3506                         __m128i color, lightmap, pix;
                             // four-pixel fast path, see the glow branch for the mask test and the fixed-point layout
3507                         if (x + 4 <= endx && *(const unsigned int *)&pixelmask[x] == 0x01010101)
3508                         {
3509                                 __m128i pix2;
3510                                 color = _mm_loadu_si128((const __m128i *)&buffer_texture_colorbgra8[x*4]);
3511                                 lightmap = _mm_loadu_si128((const __m128i *)&buffer_texture_lightmapbgra8[x*4]);
3512                                 pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpacklo_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm), 
3513                                                                           _mm_unpacklo_epi8(_mm_setzero_si128(), color));
3514                                 pix2 = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(Color_Diffusem, _mm_unpackhi_epi8(_mm_setzero_si128(), lightmap)), Color_Ambientm),
3515                                                                            _mm_unpackhi_epi8(_mm_setzero_si128(), color));
3516                                 _mm_storeu_si128((__m128i *)&pixel[x*4], _mm_packus_epi16(pix, pix2));
3517                                 x += 3;
3518                                 continue;
3519                         }
3520                         if (!pixelmask[x]) 
3521                                 continue;
3522                         color = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_colorbgra8[x*4]));
3523                         lightmap = _mm_unpacklo_epi8(_mm_setzero_si128(), _mm_cvtsi32_si128(*(const int *)&buffer_texture_lightmapbgra8[x*4]));
3524                         pix = _mm_mulhi_epu16(_mm_add_epi16(_mm_mulhi_epu16(lightmap, Color_Diffusem), Color_Ambientm), color);
3525                         *(int *)&pixel[x*4] = _mm_cvtsi128_si32(_mm_packus_epi16(pix, pix));
3526                 }
3527         }
             // only taken when the output was redirected to the scratch buffer above
3528         if (pixel == buffer_FragColorbgra8)
3529                 DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
3530 #endif
3531 }
3532
3533
// Forward declarations: the FakeLight and LightDirectionMap shader wrappers that
// follow call these two functions, whose definitions only appear further down.
3534 void DPSOFTRAST_VertexShader_LightDirection(void);
3535 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
3536
// FakeLight vertex shader: no work of its own, it delegates entirely to
// DPSOFTRAST_VertexShader_LightDirection.
3537 static void DPSOFTRAST_VertexShader_FakeLight(void)
3538 {
3539         DPSOFTRAST_VertexShader_LightDirection();
3540 }
3541
// FakeLight pixel shader: forwards its arguments unchanged to
// DPSOFTRAST_PixelShader_LightDirection, which has its own branch for
// thread->shader_mode == SHADERMODE_FAKELIGHT.
3542 static void DPSOFTRAST_PixelShader_FakeLight(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3543 {
3544         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3545 }
3546
3547
3548
// Model-space LightDirectionMap vertex shader: runs the common
// DPSOFTRAST_VertexShader_LightDirection work, then additionally loads
// TEXCOORD4, which DPSOFTRAST_PixelShader_LightDirection uses to sample the
// lightmap and deluxemap textures.
3549 static void DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace(void)
3550 {
3551         DPSOFTRAST_VertexShader_LightDirection();
3552         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3553 }
3554
// Model-space LightDirectionMap pixel shader: forwards its arguments unchanged
// to DPSOFTRAST_PixelShader_LightDirection, which has its own branch for
// thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE.
3555 static void DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3556 {
3557         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3558 }
3559
3560
3561
// Tangent-space LightDirectionMap vertex shader: runs the common
// DPSOFTRAST_VertexShader_LightDirection work, then additionally loads
// TEXCOORD4, which DPSOFTRAST_PixelShader_LightDirection uses to sample the
// lightmap and deluxemap textures.
3562 static void DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace(void)
3563 {
3564         DPSOFTRAST_VertexShader_LightDirection();
3565         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4);
3566 }
3567
// Tangent-space LightDirectionMap pixel shader: forwards its arguments
// unchanged to DPSOFTRAST_PixelShader_LightDirection, which has its own branch
// for thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE.
3568 static void DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3569 {
3570         DPSOFTRAST_PixelShader_LightDirection(thread, triangle, span);
3571 }
3572
3573
3574
3575 void DPSOFTRAST_VertexShader_LightDirection(void)
3576 {
3577         int i;
3578         int numvertices = dpsoftrast.numvertices;
3579         float LightDir[4];
3580         float LightVector[4];
3581         float EyePosition[4];
3582         float EyeVectorModelSpace[4];
3583         float EyeVector[4];
3584         float position[4];
3585         float svector[4];
3586         float tvector[4];
3587         float normal[4];
3588         LightDir[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+0];
3589         LightDir[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+1];
3590         LightDir[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+2];
3591         LightDir[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightDir*4+3];
3592         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
3593         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
3594         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
3595         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
3596         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
3597         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
3598         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
3599         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
3600         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
3601         for (i = 0;i < numvertices;i++)
3602         {
3603                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
3604                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
3605                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
3606                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
3607                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
3608                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
3609                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
3610                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
3611                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
3612                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
3613                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
3614                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
3615                 LightVector[0] = svector[0] * LightDir[0] + svector[1] * LightDir[1] + svector[2] * LightDir[2];
3616                 LightVector[1] = tvector[0] * LightDir[0] + tvector[1] * LightDir[1] + tvector[2] * LightDir[2];
3617                 LightVector[2] = normal[0] * LightDir[0] + normal[1] * LightDir[1] + normal[2] * LightDir[2];
3618                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+0] = LightVector[0];
3619                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+1] = LightVector[1];
3620                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+2] = LightVector[2];
3621                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD5][i*4+3] = 0.0f;
3622                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
3623                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
3624                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
3625                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
3626                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
3627                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
3628                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
3629                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
3630                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
3631                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
3632         }
3633         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
3634 }
3635
3636 #define DPSOFTRAST_Min(a,b) ((a) < (b) ? (a) : (b))
3637 #define DPSOFTRAST_Max(a,b) ((a) > (b) ? (a) : (b))
3638 #define DPSOFTRAST_Vector3Dot(a,b) ((a)[0]*(b)[0]+(a)[1]*(b)[1]+(a)[2]*(b)[2])
3639 #define DPSOFTRAST_Vector3LengthSquared(v) (DPSOFTRAST_Vector3Dot((v),(v)))
3640 #define DPSOFTRAST_Vector3Length(v) (sqrt(DPSOFTRAST_Vector3LengthSquared(v)))
3641 #define DPSOFTRAST_Vector3Normalize(v)\
3642 do\
3643 {\
3644         float len = sqrt(DPSOFTRAST_Vector3Dot(v,v));\
3645         if (len)\
3646         {\
3647                 len = 1.0f / len;\
3648                 v[0] *= len;\
3649                 v[1] *= len;\
3650                 v[2] *= len;\
3651         }\
3652 }\
3653 while(0)
3654
3655 void DPSOFTRAST_PixelShader_LightDirection(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
3656 {
3657         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
3658         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3659         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3660         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3661         unsigned char buffer_texture_glowbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3662         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3663         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3664         unsigned char buffer_texture_deluxemapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3665         unsigned char buffer_texture_lightmapbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3666         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
3667         int x, startx = span->startx, endx = span->endx;
3668         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
3669         float LightVectordata[4];
3670         float LightVectorslope[4];
3671         float EyeVectordata[4];
3672         float EyeVectorslope[4];
3673         float VectorSdata[4];
3674         float VectorSslope[4];
3675         float VectorTdata[4];
3676         float VectorTslope[4];
3677         float VectorRdata[4];
3678         float VectorRslope[4];
3679         float z;
3680         float diffusetex[4];
3681         float glosstex[4];
3682         float surfacenormal[4];
3683         float lightnormal[4];
3684         float lightnormal_modelspace[4];
3685         float eyenormal[4];
3686         float specularnormal[4];
3687         float diffuse;
3688         float specular;
3689         float SpecularPower;
3690         int d[4];
3691         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
3692         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
3693         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
3694         Color_Glow[3] = 0.0f;
3695         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
3696         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
3697         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
3698         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
3699         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
3700         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
3701         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
3702         Color_Pants[3] = 0.0f;
3703         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
3704         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
3705         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
3706         Color_Shirt[3] = 0.0f;
3707         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
3708         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3709         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3710         {
3711                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3712                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3713         }
3714         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3715         {
3716                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glowbgra8, GL20TU_GLOW, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3717         }
3718         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
3719         {
3720                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3721                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3722                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3723                 Color_Diffuse[3] = 0.0f;
3724                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3725                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3726                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3727                 LightColor[3] = 0.0f;
3728                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3729                 Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
3730                 Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
3731                 Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
3732                 Color_Specular[3] = 0.0f;
3733                 SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
3734                 DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3735                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3736
3737                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3738                 {
3739                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3740                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3741                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3742                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3743                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3744                 }
3745                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3746                 {
3747                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3748                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3749                 }
3750                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3751                 {
3752                         // nothing of this needed
3753                 }
3754                 else
3755                 {
3756                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3757                 }
3758
3759                 for (x = startx;x < endx;x++)
3760                 {
3761                         z = buffer_z[x];
3762                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3763                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3764                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3765                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3766                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
3767                         {
3768                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
3769                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
3770                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
3771                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
3772                         }
3773                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
3774                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
3775                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
3776                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
3777                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3778                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3779                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3780                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3781
3782                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3783                         {
3784                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3785                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3786                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3787                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3788
3789                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3790                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3791                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3792                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3793
3794                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3795                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3796                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3797                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3798
3799                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3800                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3801                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3802                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3803
3804                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3805                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3806
3807                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3808                                 {
3809                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3810                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3811                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3812                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3813                                 }
3814                         }
3815                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3816                         {
3817                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3818                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3819                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3820                                 {
3821                                         float f = 1.0f / 256.0f;
3822                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3823                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3824                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3825                                 }
3826                         }
3827                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3828                         {
3829                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3830                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3831                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3832                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3833
3834                                 LightColor[0] = 1.0;
3835                                 LightColor[1] = 1.0;
3836                                 LightColor[2] = 1.0;
3837                         }
3838                         else
3839                         {
3840                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
3841                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
3842                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
3843                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3844                         }
3845
3846                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
3847
3848                         if(thread->shader_exactspecularmath)
3849                         {
3850                                 // reflect lightnormal at surfacenormal, take the negative of that
3851                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
3852                                 float f;
3853                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
3854                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
3855                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
3856                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
3857
3858                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
3859                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3860                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3861                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3862                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3863
3864                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3865                         }
3866                         else
3867                         {
3868                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3869                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3870                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3871                                 DPSOFTRAST_Vector3Normalize(eyenormal);
3872
3873                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
3874                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
3875                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
3876                                 DPSOFTRAST_Vector3Normalize(specularnormal);
3877
3878                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
3879                         }
3880                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
3881
3882                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
3883                         {
3884                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3885                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3886                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3887                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3888                         }
3889                         else
3890                         {
3891                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0] + (diffusetex[0] * Color_Diffuse[0] * diffuse + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]);if (d[0] > 255) d[0] = 255;
3892                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1] + (diffusetex[1] * Color_Diffuse[1] * diffuse + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]);if (d[1] > 255) d[1] = 255;
3893                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2] + (diffusetex[2] * Color_Diffuse[2] * diffuse + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]);if (d[2] > 255) d[2] = 255;
3894                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
3895                         }
3896
3897                         buffer_FragColorbgra8[x*4+0] = d[0];
3898                         buffer_FragColorbgra8[x*4+1] = d[1];
3899                         buffer_FragColorbgra8[x*4+2] = d[2];
3900                         buffer_FragColorbgra8[x*4+3] = d[3];
3901                 }
3902         }
3903         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
3904         {
3905                 Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
3906                 Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
3907                 Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
3908                 Color_Diffuse[3] = 0.0f;
3909                 LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
3910                 LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
3911                 LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
3912                 LightColor[3] = 0.0f;
3913                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
3914
3915                 if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3916                 {
3917                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorSdata, VectorSslope, DPSOFTRAST_ARRAY_TEXCOORD1);
3918                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorTdata, VectorTslope, DPSOFTRAST_ARRAY_TEXCOORD2);
3919                         DPSOFTRAST_CALCATTRIB4F(triangle, span, VectorRdata, VectorRslope, DPSOFTRAST_ARRAY_TEXCOORD3);
3920                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3921                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3922                 }
3923                 else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3924                 {
3925                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_lightmapbgra8, GL20TU_LIGHTMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3926                         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_deluxemapbgra8, GL20TU_DELUXEMAP, DPSOFTRAST_ARRAY_TEXCOORD4, buffer_z);
3927                 }
3928                 else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3929                 {
3930                         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
3931                 }
3932                 else
3933                 {
3934                         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD5);
3935                 }
3936
3937                 for (x = startx;x < endx;x++)
3938                 {
3939                         z = buffer_z[x];
3940                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
3941                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
3942                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
3943                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
3944                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3945                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3946                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3947                         DPSOFTRAST_Vector3Normalize(surfacenormal);
3948
3949                         if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_MODELSPACE)
3950                         {
3951                                 // myhalf3 lightnormal_modelspace = myhalf3(dp_texture2D(Texture_Deluxemap, TexCoordSurfaceLightmap.zw)) * 2.0 + myhalf3(-1.0, -1.0, -1.0);\n";
3952                                 lightnormal_modelspace[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3953                                 lightnormal_modelspace[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3954                                 lightnormal_modelspace[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3955
3956                                 // lightnormal.x = dot(lightnormal_modelspace, myhalf3(VectorS));\n"
3957                                 lightnormal[0] = lightnormal_modelspace[0] * (VectorSdata[0] + VectorSslope[0] * x)
3958                                                + lightnormal_modelspace[1] * (VectorSdata[1] + VectorSslope[1] * x)
3959                                                + lightnormal_modelspace[2] * (VectorSdata[2] + VectorSslope[2] * x);
3960
3961                                 // lightnormal.y = dot(lightnormal_modelspace, myhalf3(VectorT));\n"
3962                                 lightnormal[1] = lightnormal_modelspace[0] * (VectorTdata[0] + VectorTslope[0] * x)
3963                                                + lightnormal_modelspace[1] * (VectorTdata[1] + VectorTslope[1] * x)
3964                                                + lightnormal_modelspace[2] * (VectorTdata[2] + VectorTslope[2] * x);
3965
3966                                 // lightnormal.z = dot(lightnormal_modelspace, myhalf3(VectorR));\n"
3967                                 lightnormal[2] = lightnormal_modelspace[0] * (VectorRdata[0] + VectorRslope[0] * x)
3968                                                + lightnormal_modelspace[1] * (VectorRdata[1] + VectorRslope[1] * x)
3969                                                + lightnormal_modelspace[2] * (VectorRdata[2] + VectorRslope[2] * x);
3970
3971                                 // lightnormal = normalize(lightnormal); // VectorS/T/R are not always perfectly normalized, and EXACTSPECULARMATH is very picky about this\n"
3972                                 DPSOFTRAST_Vector3Normalize(lightnormal);
3973
3974                                 // myhalf3 lightcolor = myhalf3(dp_texture2D(Texture_Lightmap, TexCoordSurfaceLightmap.zw));\n";
3975                                 {
3976                                         float f = 1.0f / (256.0f * max(0.25f, lightnormal[2]));
3977                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3978                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3979                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3980                                 }
3981                         }
3982                         else if(thread->shader_mode == SHADERMODE_LIGHTDIRECTIONMAP_TANGENTSPACE)
3983                         {
3984                                 lightnormal[0] = buffer_texture_deluxemapbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
3985                                 lightnormal[1] = buffer_texture_deluxemapbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
3986                                 lightnormal[2] = buffer_texture_deluxemapbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
3987                                 {
3988                                         float f = 1.0f / 256.0f;
3989                                         LightColor[0] = buffer_texture_lightmapbgra8[x*4+0] * f;
3990                                         LightColor[1] = buffer_texture_lightmapbgra8[x*4+1] * f;
3991                                         LightColor[2] = buffer_texture_lightmapbgra8[x*4+2] * f;
3992                                 }
3993                         }
3994                         else if(thread->shader_mode == SHADERMODE_FAKELIGHT)
3995                         {
3996                                 lightnormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
3997                                 lightnormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
3998                                 lightnormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
3999                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4000
4001                                 LightColor[0] = 1.0;
4002                                 LightColor[1] = 1.0;
4003                                 LightColor[2] = 1.0;
4004                         }
4005                         else
4006                         {
4007                                 lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4008                                 lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4009                                 lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4010                                 DPSOFTRAST_Vector3Normalize(lightnormal);
4011                         }
4012
4013                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4014                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4015                         {
4016                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4017                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4018                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4019                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4020                         }
4021                         else
4022                         {
4023                                 d[0] = (int)(                                                + diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse * LightColor[0]));if (d[0] > 255) d[0] = 255;
4024                                 d[1] = (int)(                                                + diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse * LightColor[1]));if (d[1] > 255) d[1] = 255;
4025                                 d[2] = (int)(                                                + diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse * LightColor[2]));if (d[2] > 255) d[2] = 255;
4026                                 d[3] = (int)(                                                  diffusetex[3] * (Color_Ambient[3]                                             ));if (d[3] > 255) d[3] = 255;
4027                         }
4028                         buffer_FragColorbgra8[x*4+0] = d[0];
4029                         buffer_FragColorbgra8[x*4+1] = d[1];
4030                         buffer_FragColorbgra8[x*4+2] = d[2];
4031                         buffer_FragColorbgra8[x*4+3] = d[3];
4032                 }
4033         }
4034         else
4035         {
4036                 for (x = startx;x < endx;x++)
4037                 {
4038                         z = buffer_z[x];
4039                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4040                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4041                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4042                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4043
4044                         if (thread->shader_permutation & SHADERPERMUTATION_GLOW)
4045                         {
4046                                 d[0] = (int)(buffer_texture_glowbgra8[x*4+0] * Color_Glow[0] + diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4047                                 d[1] = (int)(buffer_texture_glowbgra8[x*4+1] * Color_Glow[1] + diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4048                                 d[2] = (int)(buffer_texture_glowbgra8[x*4+2] * Color_Glow[2] + diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4049                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4050                         }
4051                         else
4052                         {
4053                                 d[0] = (int)(                                                  diffusetex[0] * Color_Ambient[0]);if (d[0] > 255) d[0] = 255;
4054                                 d[1] = (int)(                                                  diffusetex[1] * Color_Ambient[1]);if (d[1] > 255) d[1] = 255;
4055                                 d[2] = (int)(                                                  diffusetex[2] * Color_Ambient[2]);if (d[2] > 255) d[2] = 255;
4056                                 d[3] = (int)(                                                  diffusetex[3] * Color_Ambient[3]);if (d[3] > 255) d[3] = 255;
4057                         }
4058                         buffer_FragColorbgra8[x*4+0] = d[0];
4059                         buffer_FragColorbgra8[x*4+1] = d[1];
4060                         buffer_FragColorbgra8[x*4+2] = d[2];
4061                         buffer_FragColorbgra8[x*4+3] = d[3];
4062                 }
4063         }
4064         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4065 }
4066
4067
4068
4069 static void DPSOFTRAST_VertexShader_LightSource(void) // vertex stage of the light-source shader: fills per-vertex light/eye vectors in dpsoftrast.post_array4f, then transforms POSITION and TEXCOORD3
4070 {
4071         int i;
4072         int numvertices = dpsoftrast.numvertices;
4073         float LightPosition[4];
4074         float LightVector[4];
4075         float LightVectorModelSpace[4];
4076         float EyePosition[4];
4077         float EyeVectorModelSpace[4];
4078         float EyeVector[4];
4079         float position[4]; // only elements [0..2] of position/svector/tvector/normal are assigned or read below
4080         float svector[4];
4081         float tvector[4];
4082         float normal[4];
4083         LightPosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+0]; // light position comes from the uniform table
4084         LightPosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+1];
4085         LightPosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+2];
4086         LightPosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_LightPosition*4+3]; // [3] is not read in this function
4087         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0]; // eye position comes from the uniform table
4088         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4089         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4090         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3]; // [3] is not read in this function
4091         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION); // NOTE(review): presumably copies the input array into post_array4f, which the loop below reads - helper not visible here, confirm
4092         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1); // TEXCOORD0 is passed through the TexMatrixM1 uniform
4093         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1); // TEXCOORD1/2/3 are read back in the loop as svector/tvector/normal
4094         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4095         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4096         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD4); // TEXCOORD4 is loaded but not touched again in this function
4097         for (i = 0;i < numvertices;i++) // one iteration per vertex; each vertex occupies 4 floats per array
4098         {
4099                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4100                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4101                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4102                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0]; // copied to locals first because TEXCOORD1/2 are overwritten further down
4103                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4104                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4105                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4106                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4107                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4108                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4109                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4110                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4111                 LightVectorModelSpace[0] = LightPosition[0] - position[0]; // vector from the vertex to the light position
4112                 LightVectorModelSpace[1] = LightPosition[1] - position[1];
4113                 LightVectorModelSpace[2] = LightPosition[2] - position[2];
4114                 LightVector[0] = svector[0] * LightVectorModelSpace[0] + svector[1] * LightVectorModelSpace[1] + svector[2] * LightVectorModelSpace[2]; // dot with svector
4115                 LightVector[1] = tvector[0] * LightVectorModelSpace[0] + tvector[1] * LightVectorModelSpace[1] + tvector[2] * LightVectorModelSpace[2]; // dot with tvector
4116                 LightVector[2] = normal[0]  * LightVectorModelSpace[0] + normal[1]  * LightVectorModelSpace[1] + normal[2]  * LightVectorModelSpace[2]; // dot with normal
4117                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0] = LightVector[0]; // TEXCOORD1 now carries the light vector; DPSOFTRAST_PixelShader_LightSource reads it as LightVector
4118                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1] = LightVector[1];
4119                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2] = LightVector[2];
4120                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+3] = 0.0f; // fourth component is forced to zero
4121                 EyeVectorModelSpace[0] = EyePosition[0] - position[0]; // vector from the vertex to the eye position
4122                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4123                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4124                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2]; // same three dot products as for the light vector
4125                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4126                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4127                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0] = EyeVector[0]; // TEXCOORD2 now carries the eye vector; DPSOFTRAST_PixelShader_LightSource reads it as EyeVector
4128                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1] = EyeVector[1];
4129                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2] = EyeVector[2];
4130                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+3] = 0.0f; // fourth component is forced to zero
4131         }
4132         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1); // done after the loop, which needs the untransformed position; NOTE(review): meaning of the -1 input index is defined inside the helper (not visible here) - confirm
4133         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelToLightM1); // presumably writes TEXCOORD3 from POSITION via the ModelToLightM1 uniform (replacing the normal read above); DPSOFTRAST_PixelShader_LightSource reads TEXCOORD3 as CubeVector
4134 }
4135
4136 static void DPSOFTRAST_PixelShader_LightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4137 {
4138 #ifdef SSE_POSSIBLE
4139         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4140         unsigned char buffer_texture_colorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4141         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4142         unsigned char buffer_texture_glossbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4143         unsigned char buffer_texture_cubebgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4144         unsigned char buffer_texture_pantsbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4145         unsigned char buffer_texture_shirtbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4146         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4147         int x, startx = span->startx, endx = span->endx;
4148         float Color_Ambient[4], Color_Diffuse[4], Color_Specular[4], Color_Glow[4], Color_Pants[4], Color_Shirt[4], LightColor[4];
4149         float CubeVectordata[4];
4150         float CubeVectorslope[4];
4151         float LightVectordata[4];
4152         float LightVectorslope[4];
4153         float EyeVectordata[4];
4154         float EyeVectorslope[4];
4155         float z;
4156         float diffusetex[4];
4157         float glosstex[4];
4158         float surfacenormal[4];
4159         float lightnormal[4];
4160         float eyenormal[4];
4161         float specularnormal[4];
4162         float diffuse;
4163         float specular;
4164         float SpecularPower;
4165         float CubeVector[4];
4166         float attenuation;
4167         int d[4];
4168         Color_Glow[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+0];
4169         Color_Glow[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+1];
4170         Color_Glow[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Glow*4+2];
4171         Color_Glow[3] = 0.0f;
4172         Color_Ambient[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+0];
4173         Color_Ambient[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+1];
4174         Color_Ambient[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Ambient*4+2];
4175         Color_Ambient[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_Alpha*4+0];
4176         Color_Diffuse[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+0];
4177         Color_Diffuse[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+1];
4178         Color_Diffuse[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Diffuse*4+2];
4179         Color_Diffuse[3] = 0.0f;
4180         Color_Specular[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+0];
4181         Color_Specular[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+1];
4182         Color_Specular[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Specular*4+2];
4183         Color_Specular[3] = 0.0f;
4184         Color_Pants[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+0];
4185         Color_Pants[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+1];
4186         Color_Pants[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Pants*4+2];
4187         Color_Pants[3] = 0.0f;
4188         Color_Shirt[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+0];
4189         Color_Shirt[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+1];
4190         Color_Shirt[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_Color_Shirt*4+2];
4191         Color_Shirt[3] = 0.0f;
4192         LightColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+0];
4193         LightColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+1];
4194         LightColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_LightColor*4+2];
4195         LightColor[3] = 0.0f;
4196         SpecularPower = thread->uniform4f[DPSOFTRAST_UNIFORM_SpecularPower*4+0] * (1.0f / 255.0f);
4197         DPSOFTRAST_CALCATTRIB4F(triangle, span, LightVectordata, LightVectorslope, DPSOFTRAST_ARRAY_TEXCOORD1);
4198         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD2);
4199         DPSOFTRAST_CALCATTRIB4F(triangle, span, CubeVectordata, CubeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD3);
4200         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4201         memset(buffer_FragColorbgra8 + startx*4, 0, (endx-startx)*4); // clear first, because we skip writing black pixels, and there are a LOT of them...
4202         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_colorbgra8, GL20TU_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4203         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4204         {
4205                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_pantsbgra8, GL20TU_PANTS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4206                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_shirtbgra8, GL20TU_SHIRT, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4207         }
4208         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4209                 DPSOFTRAST_Draw_Span_TextureCubeVaryingBGRA8(triangle, span, buffer_texture_cubebgra8, GL20TU_CUBE, DPSOFTRAST_ARRAY_TEXCOORD3, buffer_z);
4210         if (thread->shader_permutation & SHADERPERMUTATION_SPECULAR)
4211         {
4212                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4213                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_glossbgra8, GL20TU_GLOSS, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4214                 for (x = startx;x < endx;x++)
4215                 {
4216                         z = buffer_z[x];
4217                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4218                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4219                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4220                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4221                         if (attenuation < 0.01f)
4222                                 continue;
4223                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4224                         {
4225                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4226                                 if (attenuation < 0.01f)
4227                                         continue;
4228                         }
4229
4230                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4231                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4232                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4233                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4234                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4235                         {
4236                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4237                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4238                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4239                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4240                         }
4241                         glosstex[0] = buffer_texture_glossbgra8[x*4+0];
4242                         glosstex[1] = buffer_texture_glossbgra8[x*4+1];
4243                         glosstex[2] = buffer_texture_glossbgra8[x*4+2];
4244                         glosstex[3] = buffer_texture_glossbgra8[x*4+3];
4245                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4246                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4247                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4248                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4249
4250                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4251                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4252                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4253                         DPSOFTRAST_Vector3Normalize(lightnormal);
4254
4255                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4256
4257                         if(thread->shader_exactspecularmath)
4258                         {
4259                                 // reflect lightnormal at surfacenormal, take the negative of that
4260                                 // i.e. we want (2*dot(N, i) * N - I) for N=surfacenormal, I=lightnormal
4261                                 float f;
4262                                 f = DPSOFTRAST_Vector3Dot(lightnormal, surfacenormal);
4263                                 specularnormal[0] = 2*f*surfacenormal[0] - lightnormal[0];
4264                                 specularnormal[1] = 2*f*surfacenormal[1] - lightnormal[1];
4265                                 specularnormal[2] = 2*f*surfacenormal[2] - lightnormal[2];
4266
4267                                 // dot of this and normalize(EyeVectorFogDepth.xyz)
4268                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4269                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4270                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4271                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4272
4273                                 specular = DPSOFTRAST_Vector3Dot(eyenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4274                         }
4275                         else
4276                         {
4277                                 eyenormal[0] = (EyeVectordata[0] + EyeVectorslope[0]*x) * z;
4278                                 eyenormal[1] = (EyeVectordata[1] + EyeVectorslope[1]*x) * z;
4279                                 eyenormal[2] = (EyeVectordata[2] + EyeVectorslope[2]*x) * z;
4280                                 DPSOFTRAST_Vector3Normalize(eyenormal);
4281
4282                                 specularnormal[0] = lightnormal[0] + eyenormal[0];
4283                                 specularnormal[1] = lightnormal[1] + eyenormal[1];
4284                                 specularnormal[2] = lightnormal[2] + eyenormal[2];
4285                                 DPSOFTRAST_Vector3Normalize(specularnormal);
4286
4287                                 specular = DPSOFTRAST_Vector3Dot(surfacenormal, specularnormal);if (specular < 0.0f) specular = 0.0f;
4288                         }
4289                         specular = pow(specular, 1.0f + SpecularPower * glosstex[3]);
4290
4291                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4292                         {
4293                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4294                                 attenuation *= (1.0f / 255.0f);
4295                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4296                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4297                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4298                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4299                         }
4300                         else
4301                         {
4302                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse) + glosstex[0] * Color_Specular[0] * specular) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4303                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse) + glosstex[1] * Color_Specular[1] * specular) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4304                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse) + glosstex[2] * Color_Specular[2] * specular) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4305                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4306                         }
4307                         buffer_FragColorbgra8[x*4+0] = d[0];
4308                         buffer_FragColorbgra8[x*4+1] = d[1];
4309                         buffer_FragColorbgra8[x*4+2] = d[2];
4310                         buffer_FragColorbgra8[x*4+3] = d[3];
4311                 }
4312         }
4313         else if (thread->shader_permutation & SHADERPERMUTATION_DIFFUSE)
4314         {
4315                 DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4316                 for (x = startx;x < endx;x++)
4317                 {
4318                         z = buffer_z[x];
4319                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4320                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4321                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4322                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4323                         if (attenuation < 0.01f)
4324                                 continue;
4325                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4326                         {
4327                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4328                                 if (attenuation < 0.01f)
4329                                         continue;
4330                         }
4331
4332                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4333                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4334                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4335                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4336                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4337                         {
4338                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4339                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4340                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4341                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4342                         }
4343                         surfacenormal[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4344                         surfacenormal[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4345                         surfacenormal[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4346                         DPSOFTRAST_Vector3Normalize(surfacenormal);
4347
4348                         lightnormal[0] = (LightVectordata[0] + LightVectorslope[0]*x) * z;
4349                         lightnormal[1] = (LightVectordata[1] + LightVectorslope[1]*x) * z;
4350                         lightnormal[2] = (LightVectordata[2] + LightVectorslope[2]*x) * z;
4351                         DPSOFTRAST_Vector3Normalize(lightnormal);
4352
4353                         diffuse = DPSOFTRAST_Vector3Dot(surfacenormal, lightnormal);if (diffuse < 0.0f) diffuse = 0.0f;
4354                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4355                         {
4356                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4357                                 attenuation *= (1.0f / 255.0f);
4358                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4359                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4360                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4361                                 d[3] = (int)( diffusetex[3]                                                                                                                   );if (d[3] > 255) d[3] = 255;
4362                         }
4363                         else
4364                         {
4365                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0] + Color_Diffuse[0] * diffuse)) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4366                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1] + Color_Diffuse[1] * diffuse)) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4367                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2] + Color_Diffuse[2] * diffuse)) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4368                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4369                         }
4370                         buffer_FragColorbgra8[x*4+0] = d[0];
4371                         buffer_FragColorbgra8[x*4+1] = d[1];
4372                         buffer_FragColorbgra8[x*4+2] = d[2];
4373                         buffer_FragColorbgra8[x*4+3] = d[3];
4374                 }
4375         }
4376         else
4377         {
4378                 for (x = startx;x < endx;x++)
4379                 {
4380                         z = buffer_z[x];
4381                         CubeVector[0] = (CubeVectordata[0] + CubeVectorslope[0]*x) * z;
4382                         CubeVector[1] = (CubeVectordata[1] + CubeVectorslope[1]*x) * z;
4383                         CubeVector[2] = (CubeVectordata[2] + CubeVectorslope[2]*x) * z;
4384                         attenuation = 1.0f - DPSOFTRAST_Vector3LengthSquared(CubeVector);
4385                         if (attenuation < 0.01f)
4386                                 continue;
4387                         if (thread->shader_permutation & SHADERPERMUTATION_SHADOWMAP2D)
4388                         {
4389                                 attenuation *= DPSOFTRAST_SampleShadowmap(CubeVector);
4390                                 if (attenuation < 0.01f)
4391                                         continue;
4392                         }
4393
4394                         diffusetex[0] = buffer_texture_colorbgra8[x*4+0];
4395                         diffusetex[1] = buffer_texture_colorbgra8[x*4+1];
4396                         diffusetex[2] = buffer_texture_colorbgra8[x*4+2];
4397                         diffusetex[3] = buffer_texture_colorbgra8[x*4+3];
4398                         if (thread->shader_permutation & SHADERPERMUTATION_COLORMAPPING)
4399                         {
4400                                 diffusetex[0] += buffer_texture_pantsbgra8[x*4+0] * Color_Pants[0] + buffer_texture_shirtbgra8[x*4+0] * Color_Shirt[0];
4401                                 diffusetex[1] += buffer_texture_pantsbgra8[x*4+1] * Color_Pants[1] + buffer_texture_shirtbgra8[x*4+1] * Color_Shirt[1];
4402                                 diffusetex[2] += buffer_texture_pantsbgra8[x*4+2] * Color_Pants[2] + buffer_texture_shirtbgra8[x*4+2] * Color_Shirt[2];
4403                                 diffusetex[3] += buffer_texture_pantsbgra8[x*4+3] * Color_Pants[3] + buffer_texture_shirtbgra8[x*4+3] * Color_Shirt[3];
4404                         }
4405                         if (thread->shader_permutation & SHADERPERMUTATION_CUBEFILTER)
4406                         {
4407                                 // scale down the attenuation to account for the cubefilter multiplying everything by 255
4408                                 attenuation *= (1.0f / 255.0f);
4409                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0] * buffer_texture_cubebgra8[x*4+0] * attenuation);if (d[0] > 255) d[0] = 255;
4410                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1] * buffer_texture_cubebgra8[x*4+1] * attenuation);if (d[1] > 255) d[1] = 255;
4411                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2] * buffer_texture_cubebgra8[x*4+2] * attenuation);if (d[2] > 255) d[2] = 255;
4412                                 d[3] = (int)( diffusetex[3]                                                                                      );if (d[3] > 255) d[3] = 255;
4413                         }
4414                         else
4415                         {
4416                                 d[0] = (int)((diffusetex[0] * (Color_Ambient[0])) * LightColor[0]                                   * attenuation);if (d[0] > 255) d[0] = 255;
4417                                 d[1] = (int)((diffusetex[1] * (Color_Ambient[1])) * LightColor[1]                                   * attenuation);if (d[1] > 255) d[1] = 255;
4418                                 d[2] = (int)((diffusetex[2] * (Color_Ambient[2])) * LightColor[2]                                   * attenuation);if (d[2] > 255) d[2] = 255;
4419                                 d[3] = (int)( diffusetex[3]                                                                                                                                                                );if (d[3] > 255) d[3] = 255;
4420                         }
4421                         buffer_FragColorbgra8[x*4+0] = d[0];
4422                         buffer_FragColorbgra8[x*4+1] = d[1];
4423                         buffer_FragColorbgra8[x*4+2] = d[2];
4424                         buffer_FragColorbgra8[x*4+3] = d[3];
4425                 }
4426         }
4427         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4428 #endif
4429 }
4430
4431
4432
4433 static void DPSOFTRAST_VertexShader_Refraction(void)
4434 {
4435         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4436         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4437         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4438 }
4439
4440 static void DPSOFTRAST_PixelShader_Refraction(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4441 {
4442         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4443         float z;
4444         int x, startx = span->startx, endx = span->endx;
4445
4446         // texture reads
4447         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4448         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4449
4450         // varyings
4451         float ModelViewProjectionPositiondata[4];
4452         float ModelViewProjectionPositionslope[4];
4453
4454         // uniforms
4455         float ScreenScaleRefractReflect[2];
4456         float ScreenCenterRefractReflect[2];
4457         float DistortScaleRefractReflect[2];
4458         float RefractColor[4];
4459
4460         DPSOFTRAST_Texture *texture = thread->texbound[GL20TU_REFRACTION];
4461         if(!texture) return;
4462
4463         // read textures
4464         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4465         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4466
4467         // read varyings
4468         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4469
4470         // read uniforms
4471         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4472         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4473         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4474         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4475         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4476         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4477         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4478         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4479         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4480         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4481
4482         // do stuff
4483         for (x = startx;x < endx;x++)
4484         {
4485                 float SafeScreenTexCoord[2];
4486                 float ScreenTexCoord[2];
4487                 float v[3];
4488                 float iw;
4489                 unsigned char c[4];
4490
4491                 z = buffer_z[x];
4492
4493                 // "    vec2 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect.xy * (1.0 / ModelViewProjectionPosition.w);\n"
4494                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4495
4496                 // "    vec2 SafeScreenTexCoord = ModelViewProjectionPosition.xy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect.xy;\n"
4497                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4498                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4499
4500                 // "    vec2 ScreenTexCoord = SafeScreenTexCoord + vec3(normalize(myhalf3(dp_texture2D(Texture_Normal, TexCoord)) - myhalf3(0.5))).xy * DistortScaleRefractReflect.zw;\n"
4501                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4502                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4503                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4504                 DPSOFTRAST_Vector3Normalize(v);
4505                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4506                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4507
4508                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4509                 DPSOFTRAST_Texture2DBGRA8(texture, 0, ScreenTexCoord[0], ScreenTexCoord[1], c);
4510
4511                 buffer_FragColorbgra8[x*4+0] = c[0] * RefractColor[0];
4512                 buffer_FragColorbgra8[x*4+1] = c[1] * RefractColor[1];
4513                 buffer_FragColorbgra8[x*4+2] = c[2] * RefractColor[2];
4514                 buffer_FragColorbgra8[x*4+3] = min(RefractColor[3] * 256, 255);
4515         }
4516
4517         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4518 }
4519
4520
4521
4522 static void DPSOFTRAST_VertexShader_Water(void)
4523 {
4524         int i;
4525         int numvertices = dpsoftrast.numvertices;
4526         float EyePosition[4];
4527         float EyeVectorModelSpace[4];
4528         float EyeVector[4];
4529         float position[4];
4530         float svector[4];
4531         float tvector[4];
4532         float normal[4];
4533         EyePosition[0] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+0];
4534         EyePosition[1] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+1];
4535         EyePosition[2] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+2];
4536         EyePosition[3] = dpsoftrast.uniform4f[DPSOFTRAST_UNIFORM_EyePosition*4+3];
4537         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION);
4538         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD1);
4539         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD2);
4540         DPSOFTRAST_Array_Load(DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD3);
4541         for (i = 0;i < numvertices;i++)
4542         {
4543                 position[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+0];
4544                 position[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+1];
4545                 position[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION][i*4+2];
4546                 svector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+0];
4547                 svector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+1];
4548                 svector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD1][i*4+2];
4549                 tvector[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+0];
4550                 tvector[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+1];
4551                 tvector[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD2][i*4+2];
4552                 normal[0] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+0];
4553                 normal[1] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+1];
4554                 normal[2] = dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD3][i*4+2];
4555                 EyeVectorModelSpace[0] = EyePosition[0] - position[0];
4556                 EyeVectorModelSpace[1] = EyePosition[1] - position[1];
4557                 EyeVectorModelSpace[2] = EyePosition[2] - position[2];
4558                 EyeVector[0] = svector[0] * EyeVectorModelSpace[0] + svector[1] * EyeVectorModelSpace[1] + svector[2] * EyeVectorModelSpace[2];
4559                 EyeVector[1] = tvector[0] * EyeVectorModelSpace[0] + tvector[1] * EyeVectorModelSpace[1] + tvector[2] * EyeVectorModelSpace[2];
4560                 EyeVector[2] = normal[0]  * EyeVectorModelSpace[0] + normal[1]  * EyeVectorModelSpace[1] + normal[2]  * EyeVectorModelSpace[2];
4561                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+0] = EyeVector[0];
4562                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+1] = EyeVector[1];
4563                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+2] = EyeVector[2];
4564                 dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_TEXCOORD6][i*4+3] = 0.0f;
4565         }
4566         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, -1, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4567         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4568         DPSOFTRAST_Array_Transform(DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD0, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_TexMatrixM1);
4569 }
4570
4571
4572 static void DPSOFTRAST_PixelShader_Water(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4573 {
4574         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4575         float z;
4576         int x, startx = span->startx, endx = span->endx;
4577
4578         // texture reads
4579         unsigned char buffer_texture_normalbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4580         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4581
4582         // varyings
4583         float ModelViewProjectionPositiondata[4];
4584         float ModelViewProjectionPositionslope[4];
4585         float EyeVectordata[4];
4586         float EyeVectorslope[4];
4587
4588         // uniforms
4589         float ScreenScaleRefractReflect[4];
4590         float ScreenCenterRefractReflect[4];
4591         float DistortScaleRefractReflect[4];
4592         float RefractColor[4];
4593         float ReflectColor[4];
4594         float ReflectFactor;
4595         float ReflectOffset;
4596
4597         DPSOFTRAST_Texture *texture_refraction = thread->texbound[GL20TU_REFRACTION];
4598         DPSOFTRAST_Texture *texture_reflection = thread->texbound[GL20TU_REFLECTION];
4599         if(!texture_refraction || !texture_reflection) return;
4600
4601         // read textures
4602         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4603         DPSOFTRAST_Draw_Span_Texture2DVaryingBGRA8(thread, triangle, span, buffer_texture_normalbgra8, GL20TU_NORMAL, DPSOFTRAST_ARRAY_TEXCOORD0, buffer_z);
4604
4605         // read varyings
4606         DPSOFTRAST_CALCATTRIB4F(triangle, span, ModelViewProjectionPositiondata, ModelViewProjectionPositionslope, DPSOFTRAST_ARRAY_TEXCOORD4);
4607         DPSOFTRAST_CALCATTRIB4F(triangle, span, EyeVectordata, EyeVectorslope, DPSOFTRAST_ARRAY_TEXCOORD6);
4608
4609         // read uniforms
4610         ScreenScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+0];
4611         ScreenScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+1];
4612         ScreenScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+2];
4613         ScreenScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenScaleRefractReflect*4+3];
4614         ScreenCenterRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+0];
4615         ScreenCenterRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+1];
4616         ScreenCenterRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+2];
4617         ScreenCenterRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ScreenCenterRefractReflect*4+3];
4618         DistortScaleRefractReflect[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+0];
4619         DistortScaleRefractReflect[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+1];
4620         DistortScaleRefractReflect[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+2];
4621         DistortScaleRefractReflect[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_DistortScaleRefractReflect*4+3];
4622         RefractColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+2];
4623         RefractColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+1];
4624         RefractColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+0];
4625         RefractColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_RefractColor*4+3];
4626         ReflectColor[0] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+2];
4627         ReflectColor[1] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+1];
4628         ReflectColor[2] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+0];
4629         ReflectColor[3] = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectColor*4+3];
4630         ReflectFactor = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectFactor*4+0];
4631         ReflectOffset = thread->uniform4f[DPSOFTRAST_UNIFORM_ReflectOffset*4+0];
4632
4633         // do stuff
4634         for (x = startx;x < endx;x++)
4635         {
4636                 float SafeScreenTexCoord[4];
4637                 float ScreenTexCoord[4];
4638                 float v[3];
4639                 float iw;
4640                 unsigned char c1[4];
4641                 unsigned char c2[4];
4642                 float Fresnel;
4643
4644                 z = buffer_z[x];
4645
4646                 // "    vec4 ScreenScaleRefractReflectIW = ScreenScaleRefractReflect * (1.0 / ModelViewProjectionPosition.w);\n"
4647                 iw = 1.0f / (ModelViewProjectionPositiondata[3] + ModelViewProjectionPositionslope[3]*x); // / z
4648
4649                 // "    vec4 SafeScreenTexCoord = ModelViewProjectionPosition.xyxy * ScreenScaleRefractReflectIW + ScreenCenterRefractReflect;\n"
4650                 SafeScreenTexCoord[0] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[0] + ScreenCenterRefractReflect[0]; // * z (disappears)
4651                 SafeScreenTexCoord[1] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[1] + ScreenCenterRefractReflect[1]; // * z (disappears)
4652                 SafeScreenTexCoord[2] = (ModelViewProjectionPositiondata[0] + ModelViewProjectionPositionslope[0]*x) * iw * ScreenScaleRefractReflect[2] + ScreenCenterRefractReflect[2]; // * z (disappears)
4653                 SafeScreenTexCoord[3] = (ModelViewProjectionPositiondata[1] + ModelViewProjectionPositionslope[1]*x) * iw * ScreenScaleRefractReflect[3] + ScreenCenterRefractReflect[3]; // * z (disappears)
4654
4655                 // "    vec4 ScreenTexCoord = SafeScreenTexCoord + vec2(normalize(vec3(dp_texture2D(Texture_Normal, TexCoord)) - vec3(0.5))).xyxy * DistortScaleRefractReflect;\n"
4656                 v[0] = buffer_texture_normalbgra8[x*4+2] * (1.0f / 128.0f) - 1.0f;
4657                 v[1] = buffer_texture_normalbgra8[x*4+1] * (1.0f / 128.0f) - 1.0f;
4658                 v[2] = buffer_texture_normalbgra8[x*4+0] * (1.0f / 128.0f) - 1.0f;
4659                 DPSOFTRAST_Vector3Normalize(v);
4660                 ScreenTexCoord[0] = SafeScreenTexCoord[0] + v[0] * DistortScaleRefractReflect[0];
4661                 ScreenTexCoord[1] = SafeScreenTexCoord[1] + v[1] * DistortScaleRefractReflect[1];
4662                 ScreenTexCoord[2] = SafeScreenTexCoord[2] + v[0] * DistortScaleRefractReflect[2];
4663                 ScreenTexCoord[3] = SafeScreenTexCoord[3] + v[1] * DistortScaleRefractReflect[3];
4664
4665                 // "    float Fresnel = pow(min(1.0, 1.0 - float(normalize(EyeVector).z)), 2.0) * ReflectFactor + ReflectOffset;\n"
4666                 v[0] = (EyeVectordata[0] + EyeVectorslope[0] * x); // * z (disappears)
4667                 v[1] = (EyeVectordata[1] + EyeVectorslope[1] * x); // * z (disappears)
4668                 v[2] = (EyeVectordata[2] + EyeVectorslope[2] * x); // * z (disappears)
4669                 DPSOFTRAST_Vector3Normalize(v);
4670                 Fresnel = 1.0f - v[2];
4671                 Fresnel = min(1.0f, Fresnel);
4672                 Fresnel = Fresnel * Fresnel * ReflectFactor + ReflectOffset;
4673
4674                 // "    dp_FragColor = vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord).rgb, 1.0) * RefractColor;\n"
4675                 // "    dp_FragColor = mix(vec4(dp_texture2D(Texture_Refraction, ScreenTexCoord.xy).rgb, 1) * RefractColor, vec4(dp_texture2D(Texture_Reflection, ScreenTexCoord.zw).rgb, 1) * ReflectColor, Fresnel);\n"
4676                 DPSOFTRAST_Texture2DBGRA8(texture_refraction, 0, ScreenTexCoord[0], ScreenTexCoord[1], c1);
4677                 DPSOFTRAST_Texture2DBGRA8(texture_reflection, 0, ScreenTexCoord[2], ScreenTexCoord[3], c2);
4678
4679                 buffer_FragColorbgra8[x*4+0] = (c1[0] * RefractColor[0]) * (1.0f - Fresnel) + (c2[0] * ReflectColor[0]) * Fresnel;
4680                 buffer_FragColorbgra8[x*4+1] = (c1[1] * RefractColor[1]) * (1.0f - Fresnel) + (c2[1] * ReflectColor[1]) * Fresnel;
4681                 buffer_FragColorbgra8[x*4+2] = (c1[2] * RefractColor[2]) * (1.0f - Fresnel) + (c2[2] * ReflectColor[2]) * Fresnel;
4682                 buffer_FragColorbgra8[x*4+3] = min((    RefractColor[3] *  (1.0f - Fresnel) +          ReflectColor[3]  * Fresnel) * 256, 255);
4683         }
4684
4685         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4686 }
4687
4688
4689
4690 static void DPSOFTRAST_VertexShader_ShowDepth(void)
4691 {
4692         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4693 }
4694
4695 static void DPSOFTRAST_PixelShader_ShowDepth(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4696 {
4697         // TODO: IMPLEMENT
4698         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4699         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4700         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4701         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4702         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4703 }
4704
4705
4706
4707 static void DPSOFTRAST_VertexShader_DeferredGeometry(void)
4708 {
4709         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4710 }
4711
4712 static void DPSOFTRAST_PixelShader_DeferredGeometry(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4713 {
4714         // TODO: IMPLEMENT
4715         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4716         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4717         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4718         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4719         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4720 }
4721
4722
4723
4724 static void DPSOFTRAST_VertexShader_DeferredLightSource(void)
4725 {
4726         DPSOFTRAST_Array_TransformProject(DPSOFTRAST_ARRAY_POSITION, DPSOFTRAST_ARRAY_POSITION, dpsoftrast.uniform4f + 4*DPSOFTRAST_UNIFORM_ModelViewProjectionMatrixM1);
4727 }
4728
4729 static void DPSOFTRAST_PixelShader_DeferredLightSource(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span)
4730 {
4731         // TODO: IMPLEMENT
4732         float buffer_z[DPSOFTRAST_DRAW_MAXSPANLENGTH];
4733         unsigned char buffer_FragColorbgra8[DPSOFTRAST_DRAW_MAXSPANLENGTH*4];
4734         DPSOFTRAST_Draw_Span_Begin(thread, triangle, span, buffer_z);
4735         memset(buffer_FragColorbgra8 + span->startx*4, 0, (span->endx - span->startx)*4);
4736         DPSOFTRAST_Draw_Span_FinishBGRA8(thread, triangle, span, buffer_FragColorbgra8);
4737 }
4738
4739
4740
4741 typedef struct DPSOFTRAST_ShaderModeInfo_s
4742 {
4743         int lodarrayindex;
4744         void (*Vertex)(void);
4745         void (*Span)(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Triangle * RESTRICT triangle, const DPSOFTRAST_State_Span * RESTRICT span);
4746         unsigned char arrays[DPSOFTRAST_ARRAY_TOTAL];
4747         unsigned char texunits[DPSOFTRAST_MAXTEXTUREUNITS];
4748 }
4749 DPSOFTRAST_ShaderModeInfo;
4750
4751 static const DPSOFTRAST_ShaderModeInfo DPSOFTRAST_ShaderModeTable[SHADERMODE_COUNT] =
4752 {
4753         {2, DPSOFTRAST_VertexShader_Generic,                        DPSOFTRAST_PixelShader_Generic,                        {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4754         {2, DPSOFTRAST_VertexShader_PostProcess,                    DPSOFTRAST_PixelShader_PostProcess,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, ~0}, {GL20TU_FIRST, GL20TU_SECOND, ~0}},
4755         {2, DPSOFTRAST_VertexShader_Depth_Or_Shadow,                DPSOFTRAST_PixelShader_Depth_Or_Shadow,                {~0}, {~0}},
4756         {2, DPSOFTRAST_VertexShader_FlatColor,                      DPSOFTRAST_PixelShader_FlatColor,                      {DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4757         {2, DPSOFTRAST_VertexShader_VertexColor,                    DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4758         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4759         {2, DPSOFTRAST_VertexShader_FakeLight,                      DPSOFTRAST_PixelShader_FakeLight,                      {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4760         {2, DPSOFTRAST_VertexShader_LightDirectionMap_ModelSpace,   DPSOFTRAST_PixelShader_LightDirectionMap_ModelSpace,   {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4761         {2, DPSOFTRAST_VertexShader_LightDirectionMap_TangentSpace, DPSOFTRAST_PixelShader_LightDirectionMap_TangentSpace, {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_LIGHTMAP, GL20TU_DELUXEMAP, ~0}},
4762         {2, DPSOFTRAST_VertexShader_Lightmap,                       DPSOFTRAST_PixelShader_Lightmap,                       {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_LIGHTMAP, GL20TU_GLOW, ~0}},
4763         {2, DPSOFTRAST_VertexShader_VertexColor,                        DPSOFTRAST_PixelShader_VertexColor,                    {DPSOFTRAST_ARRAY_COLOR, DPSOFTRAST_ARRAY_TEXCOORD0, ~0}, {GL20TU_COLOR, ~0}},
4764         {2, DPSOFTRAST_VertexShader_LightDirection,                 DPSOFTRAST_PixelShader_LightDirection,                 {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD5, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, ~0}},
4765         {2, DPSOFTRAST_VertexShader_LightSource,                    DPSOFTRAST_PixelShader_LightSource,                    {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_COLOR, GL20TU_PANTS, GL20TU_SHIRT, GL20TU_GLOW, GL20TU_NORMAL, GL20TU_GLOSS, GL20TU_CUBE, ~0}},
4766         {2, DPSOFTRAST_VertexShader_Refraction,                     DPSOFTRAST_PixelShader_Refraction,                     {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD4, ~0}, {GL20TU_NORMAL, GL20TU_REFRACTION, ~0}},
4767         {2, DPSOFTRAST_VertexShader_Water,                          DPSOFTRAST_PixelShader_Water,                          {DPSOFTRAST_ARRAY_TEXCOORD0, DPSOFTRAST_ARRAY_TEXCOORD1, DPSOFTRAST_ARRAY_TEXCOORD2, DPSOFTRAST_ARRAY_TEXCOORD3, DPSOFTRAST_ARRAY_TEXCOORD4, DPSOFTRAST_ARRAY_TEXCOORD6, ~0}, {GL20TU_NORMAL, GL20TU_REFLECTION, GL20TU_REFRACTION, ~0}},
4768         {2, DPSOFTRAST_VertexShader_ShowDepth,                      DPSOFTRAST_PixelShader_ShowDepth,                      {~0}},
4769         {2, DPSOFTRAST_VertexShader_DeferredGeometry,               DPSOFTRAST_PixelShader_DeferredGeometry,               {~0}},
4770         {2, DPSOFTRAST_VertexShader_DeferredLightSource,            DPSOFTRAST_PixelShader_DeferredLightSource,            {~0}},
4771 };
4772
4773 static void DPSOFTRAST_Draw_DepthTest(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_State_Span *span)
4774 {
4775         int x;
4776         int startx;
4777         int endx;
4778         unsigned int *depthpixel;
4779         int depth;
4780         int depthslope;
4781         unsigned int d;
4782         unsigned char *pixelmask;
4783         DPSOFTRAST_State_Triangle *triangle;
4784         triangle = &thread->triangles[span->triangle];
4785         depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4786         startx = span->startx;
4787         endx = span->endx;
4788         depth = span->depthbase;
4789         depthslope = span->depthslope;
4790         pixelmask = thread->pixelmaskarray;
4791         if (thread->depthtest && dpsoftrast.fb_depthpixels)
4792         {
4793                 switch(thread->fb_depthfunc)
4794                 {
4795                 default:
4796                 case GL_ALWAYS:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = true; break;
4797                 case GL_LESS:    for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] < d; break;
4798                 case GL_LEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] <= d; break;
4799                 case GL_EQUAL:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] == d; break;
4800                 case GL_GEQUAL:  for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] >= d; break;
4801                 case GL_GREATER: for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = depthpixel[x] > d; break;
4802                 case GL_NEVER:   for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope) pixelmask[x] = false; break;
4803                 }
4804                 while (startx < endx && !pixelmask[startx])
4805                         startx++;
4806                 while (endx > startx && !pixelmask[endx-1])
4807                         endx--;
4808         }
4809         else
4810         {
4811                 // no depth testing means we're just dealing with color...
4812                 memset(pixelmask + startx, 1, endx - startx);
4813         }
4814         span->pixelmask = pixelmask;
4815         span->startx = startx;
4816         span->endx = endx;
4817 }
4818
4819 static void DPSOFTRAST_Draw_DepthWrite(const DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_State_Span *span)
4820 {
4821         int x, d, depth, depthslope, startx, endx;
4822         const unsigned char *pixelmask;
4823         unsigned int *depthpixel;
4824         if (thread->depthmask && thread->depthtest && dpsoftrast.fb_depthpixels)
4825         {
4826                 depth = span->depthbase;
4827                 depthslope = span->depthslope;
4828                 pixelmask = span->pixelmask;
4829                 startx = span->startx;
4830                 endx = span->endx;
4831                 depthpixel = dpsoftrast.fb_depthpixels + span->y * dpsoftrast.fb_width + span->x;
4832                 for (x = startx, d = depth + depthslope*startx;x < endx;x++, d += depthslope)
4833                         if (pixelmask[x])
4834                                 depthpixel[x] = d;
4835         }
4836 }
4837
4838 static void DPSOFTRAST_Draw_ProcessSpans(DPSOFTRAST_State_Thread *thread)
4839 {
4840         int i;
4841         DPSOFTRAST_State_Triangle *triangle;
4842         DPSOFTRAST_State_Span *span;
4843         for (i = 0; i < thread->numspans; i++)
4844         {
4845                 span = &thread->spans[i];
4846                 triangle = &thread->triangles[span->triangle];
4847                 DPSOFTRAST_Draw_DepthTest(thread, span);
4848                 if (span->startx >= span->endx)
4849                         continue;
4850                 // run pixel shader if appropriate
4851                 // do this before running depthmask code, to allow the pixelshader
4852                 // to clear pixelmask values for alpha testing
4853                 if (dpsoftrast.fb_colorpixels[0] && thread->fb_colormask)
4854                         DPSOFTRAST_ShaderModeTable[thread->shader_mode].Span(thread, triangle, span);
4855                 DPSOFTRAST_Draw_DepthWrite(thread, span);
4856         }
4857         thread->numspans = 0;
4858 }
4859
// Field list for the Draw command (opcode 22).
// NOTE(review): DEFCOMMAND is defined outside this excerpt; presumably it
// generates the DPSOFTRAST_Command_Draw type that DPSOFTRAST_Interpret_Draw
// receives - confirm against the macro.
// refcount is decremented with ATOMIC_DECREMENT by DPSOFTRAST_Interpret_Draw
// when a thread skips the command; when it reaches zero, arrays is released
// with MM_FREE if commandsize shows no extra data beyond the command struct.
4860 DEFCOMMAND(22, Draw, int datasize; int starty; int endy; ATOMIC_COUNTER refcount; int clipped; int firstvertex; int numvertices; int numtriangles; float *arrays; int *element3i; unsigned short *element3s;)
4861
4862 static void DPSOFTRAST_Interpret_Draw(DPSOFTRAST_State_Thread *thread, DPSOFTRAST_Command_Draw *command)
4863 {
4864 #ifdef SSE_POSSIBLE
4865         int cullface = thread->cullface;
4866         int minx, maxx, miny, maxy;
4867         int miny1, maxy1, miny2, maxy2;
4868         __m128i fbmin, fbmax;
4869         __m128 viewportcenter, viewportscale;
4870         int firstvertex = command->firstvertex;
4871         int numvertices = command->numvertices;
4872         int numtriangles = command->numtriangles;
4873         const int *element3i = command->element3i;
4874         const unsigned short *element3s = command->element3s;
4875         int clipped = command->clipped;
4876         int i;
4877         int j;
4878         int k;
4879         int y;
4880         int e[3];
4881         __m128i screeny;
4882         int starty, endy, bandy;
4883         int numpoints;
4884         int clipcase;
4885         float clipdist[4];
4886         float clip0origin, clip0slope;
4887         int clip0dir;
4888         __m128 triangleedge1, triangleedge2, trianglenormal;
4889         __m128 clipfrac[3];
4890         __m128 screen[4];
4891         DPSOFTRAST_State_Triangle *triangle;
4892         DPSOFTRAST_Texture *texture;
4893         DPSOFTRAST_ValidateQuick(thread, DPSOFTRAST_VALIDATE_DRAW);
4894         miny = thread->fb_scissor[1];
4895         maxy = thread->fb_scissor[1] + thread->fb_scissor[3];
4896         miny1 = bound(miny, thread->miny1, maxy);
4897         maxy1 = bound(miny, thread->maxy1, maxy);
4898         miny2 = bound(miny, thread->miny2, maxy);
4899         maxy2 = bound(miny, thread->maxy2, maxy);
4900         if ((command->starty >= maxy1 || command->endy <= miny1) && (command->starty >= maxy2 || command->endy <= miny2))
4901         {
4902                 if (!ATOMIC_DECREMENT(command->refcount))
4903                 {
4904                         if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
4905                                 MM_FREE(command->arrays);
4906                 }
4907                 return;
4908         }
4909         minx = thread->fb_scissor[0];
4910         maxx = thread->fb_scissor[0] + thread->fb_scissor[2];
4911         fbmin = _mm_setr_epi16(minx, miny1, minx, miny1, minx, miny1, minx, miny1);
4912         fbmax = _mm_sub_epi16(_mm_setr_epi16(maxx, maxy2, maxx, maxy2, maxx, maxy2, maxx, maxy2), _mm_set1_epi16(1));
4913         viewportcenter = _mm_load_ps(thread->fb_viewportcenter);
4914         viewportscale = _mm_load_ps(thread->fb_viewportscale);
4915         screen[3] = _mm_setzero_ps();
4916         clipfrac[0] = clipfrac[1] = clipfrac[2] = _mm_setzero_ps();
4917         for (i = 0;i < numtriangles;i++)
4918         {
4919                 const float *screencoord4f = command->arrays;
4920                 const float *arrays = screencoord4f + numvertices*4;
4921
4922                 // generate the 3 edges of this triangle
4923                 // generate spans for the triangle - switch based on left split or right split classification of triangle
4924                 if (element3s)
4925                 {
4926                         e[0] = element3s[i*3+0] - firstvertex;
4927                         e[1] = element3s[i*3+1] - firstvertex;
4928                         e[2] = element3s[i*3+2] - firstvertex;
4929                 }
4930                 else if (element3i)
4931                 {
4932                         e[0] = element3i[i*3+0] - firstvertex;
4933                         e[1] = element3i[i*3+1] - firstvertex;
4934                         e[2] = element3i[i*3+2] - firstvertex;
4935                 }
4936                 else
4937                 {
4938                         e[0] = i*3+0;
4939                         e[1] = i*3+1;
4940                         e[2] = i*3+2;
4941                 }
4942
4943 #define SKIPBACKFACE \
4944                 triangleedge1 = _mm_sub_ps(screen[0], screen[1]); \
4945                 triangleedge2 = _mm_sub_ps(screen[2], screen[1]); \
4946                 /* store normal in 2, 0, 1 order instead of 0, 1, 2 as it requires fewer shuffles and leaves z component accessible as scalar */ \
4947                 trianglenormal = _mm_sub_ss(_mm_mul_ss(triangleedge1, _mm_shuffle_ps(triangleedge2, triangleedge2, _MM_SHUFFLE(3, 0, 2, 1))), \
4948                                                                         _mm_mul_ss(_mm_shuffle_ps(triangleedge1, triangleedge1, _MM_SHUFFLE(3, 0, 2, 1)), triangleedge2)); \
4949                 switch(cullface) \
4950                 { \
4951                 case GL_BACK: \
4952                         if (_mm_ucomilt_ss(trianglenormal, _mm_setzero_ps())) \
4953                                 continue; \
4954                         break; \
4955                 case GL_FRONT: \
4956                         if (_mm_ucomigt_ss(trianglenormal, _mm_setzero_ps())) \
4957                                 continue; \
4958                         break; \
4959                 }
4960
4961 #define CLIPPEDVERTEXLERP(k,p1, p2) \
4962                         clipfrac[p1] = _mm_set1_ps(clipdist[p1] / (clipdist[p1] - clipdist[p2])); \
4963                         { \
4964                                 __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4965                                 DPSOFTRAST_PROJECTVERTEX(screen[k], _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])), viewportcenter, viewportscale); \
4966                         }
4967 #define CLIPPEDVERTEXCOPY(k,p1) \
4968                         screen[k] = _mm_load_ps(&screencoord4f[e[p1]*4]);
4969
4970 #define GENATTRIBCOPY(attrib, p1) \
4971                 attrib = _mm_load_ps(&arrays[e[p1]*4]);
4972 #define GENATTRIBLERP(attrib, p1, p2) \
4973                 { \
4974                         __m128 v1 = _mm_load_ps(&arrays[e[p1]*4]), v2 = _mm_load_ps(&arrays[e[p2]*4]); \
4975                         attrib = _mm_add_ps(v1, _mm_mul_ps(_mm_sub_ps(v2, v1), clipfrac[p1])); \
4976                 }
4977 #define GENATTRIBS(attrib0, attrib1, attrib2) \
4978                 switch(clipcase) \
4979                 { \
4980                 default: \
4981                 case 0: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4982                 case 1: GENATTRIBCOPY(attrib0, 0); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4983                 case 2: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4984                 case 3: GENATTRIBCOPY(attrib0, 0); GENATTRIBLERP(attrib1, 0, 1); GENATTRIBLERP(attrib2, 2, 0); break; \
4985                 case 4: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBCOPY(attrib2, 2); break; \
4986                 case 5: GENATTRIBLERP(attrib0, 0, 1); GENATTRIBCOPY(attrib1, 1); GENATTRIBLERP(attrib2, 1, 2); break; \
4987                 case 6: GENATTRIBLERP(attrib0, 1, 2); GENATTRIBCOPY(attrib1, 2); GENATTRIBLERP(attrib2, 2, 0); break; \
4988                 }
4989
4990                 if (! clipped)
4991                         goto notclipped;
4992
4993                 // calculate distance from nearplane
4994                 clipdist[0] = arrays[e[0]*4+2] + arrays[e[0]*4+3];
4995                 clipdist[1] = arrays[e[1]*4+2] + arrays[e[1]*4+3];
4996                 clipdist[2] = arrays[e[2]*4+2] + arrays[e[2]*4+3];
4997                 if (clipdist[0] >= 0.0f)
4998                 {
4999                         if (clipdist[1] >= 0.0f)
5000                         {
5001                                 if (clipdist[2] >= 0.0f)
5002                                 {
5003                                 notclipped:
5004                                         // triangle is entirely in front of nearplane
5005                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2);
5006                                         SKIPBACKFACE;
5007                                         numpoints = 3;
5008                                         clipcase = 0;
5009                                 }
5010                                 else
5011                                 {
5012                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXLERP(3,2,0);
5013                                         SKIPBACKFACE;
5014                                         numpoints = 4;
5015                                         clipcase = 1;
5016                                 }
5017                         }
5018                         else
5019                         {
5020                                 if (clipdist[2] >= 0.0f)
5021                                 {
5022                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,1,2); CLIPPEDVERTEXCOPY(3,2);
5023                                         SKIPBACKFACE;
5024                                         numpoints = 4;
5025                                         clipcase = 2;
5026                                 }
5027                                 else
5028                                 {
5029                                         CLIPPEDVERTEXCOPY(0,0); CLIPPEDVERTEXLERP(1,0,1); CLIPPEDVERTEXLERP(2,2,0);
5030                                         SKIPBACKFACE;
5031                                         numpoints = 3;
5032                                         clipcase = 3;
5033                                 }
5034                         }
5035                 }
5036                 else if (clipdist[1] >= 0.0f)
5037                 {
5038                         if (clipdist[2] >= 0.0f)
5039                         {
5040                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXCOPY(2,2); CLIPPEDVERTEXLERP(3,2,0);
5041                                 SKIPBACKFACE;
5042                                 numpoints = 4;
5043                                 clipcase = 4;
5044                         }
5045                         else
5046                         {
5047                                 CLIPPEDVERTEXLERP(0,0,1); CLIPPEDVERTEXCOPY(1,1); CLIPPEDVERTEXLERP(2,1,2);
5048                                 SKIPBACKFACE;
5049                                 numpoints = 3;
5050                                 clipcase = 5;
5051                         }
5052                 }
5053                 else if (clipdist[2] >= 0.0f)
5054                 {
5055                         CLIPPEDVERTEXLERP(0,1,2); CLIPPEDVERTEXCOPY(1,2); CLIPPEDVERTEXLERP(2,2,0);
5056                         SKIPBACKFACE;
5057                         numpoints = 3;
5058                         clipcase = 6;
5059                 }
5060                 else continue; // triangle is entirely behind nearplane
5061
5062                 {
5063                         // calculate integer y coords for triangle points
5064                         __m128i screeni = _mm_packs_epi32(_mm_cvttps_epi32(_mm_movelh_ps(screen[0], screen[1])), _mm_cvttps_epi32(_mm_movelh_ps(screen[2], numpoints > 3 ? screen[3] : screen[2]))),
5065                                         screenir = _mm_shuffle_epi32(screeni, _MM_SHUFFLE(1, 0, 3, 2)),
5066                                         screenmin = _mm_min_epi16(screeni, screenir),
5067                                         screenmax = _mm_max_epi16(screeni, screenir);
5068                         screenmin = _mm_min_epi16(screenmin, _mm_shufflelo_epi16(screenmin, _MM_SHUFFLE(1, 0, 3, 2)));
5069                         screenmax = _mm_max_epi16(screenmax, _mm_shufflelo_epi16(screenmax, _MM_SHUFFLE(1, 0, 3, 2)));
5070                         screenmin = _mm_max_epi16(screenmin, fbmin);
5071                         screenmax = _mm_min_epi16(screenmax, fbmax);
5072                         // skip offscreen triangles
5073                         if (_mm_cvtsi128_si32(_mm_cmplt_epi16(screenmax, screenmin)))
5074                                 continue;
5075                         starty = _mm_extract_epi16(screenmin, 1);
5076                         endy = _mm_extract_epi16(screenmax, 1)+1;
5077                         if (starty >= maxy1 && endy <= miny2)
5078                                 continue;
5079                         screeny = _mm_srai_epi32(screeni, 16);
5080                 }
5081
5082                 triangle = &thread->triangles[thread->numtriangles];
5083
5084                 // calculate attribute plans for triangle data...
5085                 // okay, this triangle is going to produce spans, we'd better project
5086                 // the interpolants now (this is what gives perspective texturing),
5087                 // this consists of simply multiplying all arrays by the W coord
5088                 // (which is basically 1/Z), which will be undone per-pixel
5089                 // (multiplying by Z again) to get the perspective-correct array
5090                 // values
5091                 {
5092                         __m128 attribuvslope, attribuxslope, attribuyslope, attribvxslope, attribvyslope, attriborigin, attribedge1, attribedge2, attribxslope, attribyslope, w0, w1, w2, x1, y1;
5093                         __m128 mipedgescale, mipdensity;
5094                         attribuvslope = _mm_div_ps(_mm_movelh_ps(triangleedge1, triangleedge2), _mm_shuffle_ps(trianglenormal, trianglenormal, _MM_SHUFFLE(0, 0, 0, 0)));
5095                         attribuxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(3, 3, 3, 3));
5096                         attribuyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(2, 2, 2, 2));
5097                         attribvxslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(1, 1, 1, 1));
5098                         attribvyslope = _mm_shuffle_ps(attribuvslope, attribuvslope, _MM_SHUFFLE(0, 0, 0, 0));
5099                         w0 = _mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(3, 3, 3, 3));
5100                         w1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(3, 3, 3, 3));
5101                         w2 = _mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(3, 3, 3, 3));
5102                         attribedge1 = _mm_sub_ss(w0, w1);
5103                         attribedge2 = _mm_sub_ss(w2, w1);
5104                         attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5105                         attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5106                         x1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(0, 0, 0, 0));
5107                         y1 = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(1, 1, 1, 1));
5108                         attriborigin = _mm_sub_ss(w1, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5109                         _mm_store_ss(&triangle->w[0], attribxslope);
5110                         _mm_store_ss(&triangle->w[1], attribyslope);
5111                         _mm_store_ss(&triangle->w[2], attriborigin);
5112                         
5113                         clip0origin = 0;
5114                         clip0slope = 0;
5115                         clip0dir = 0;
5116                         if(thread->fb_clipplane[0] || thread->fb_clipplane[1] || thread->fb_clipplane[2])
5117                         {
5118                                 float cliporigin, clipxslope, clipyslope;
5119                                 attriborigin = _mm_shuffle_ps(screen[1], screen[1], _MM_SHUFFLE(2, 2, 2, 2));
5120                                 attribedge1 = _mm_sub_ss(_mm_shuffle_ps(screen[0], screen[0], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5121                                 attribedge2 = _mm_sub_ss(_mm_shuffle_ps(screen[2], screen[2], _MM_SHUFFLE(2, 2, 2, 2)), attriborigin);
5122                                 attribxslope = _mm_sub_ss(_mm_mul_ss(attribuxslope, attribedge1), _mm_mul_ss(attribvxslope, attribedge2));
5123                                 attribyslope = _mm_sub_ss(_mm_mul_ss(attribvyslope, attribedge2), _mm_mul_ss(attribuyslope, attribedge1));
5124                                 attriborigin = _mm_sub_ss(attriborigin, _mm_add_ss(_mm_mul_ss(attribxslope, x1), _mm_mul_ss(attribyslope, y1)));
5125                                 cliporigin = _mm_cvtss_f32(attriborigin)*thread->fb_clipplane[2] + thread->fb_clipplane[3];
5126                                 clipxslope = thread->fb_clipplane[0] + _mm_cvtss_f32(attribxslope)*thread->fb_clipplane[2];
5127                                 clipyslope = thread->fb_clipplane[1] + _mm_cvtss_f32(attribyslope)*thread->fb_clipplane[2];
5128                                 if(clipxslope != 0)
5129                                 {
5130                                         clip0origin = -cliporigin/clipxslope;
5131                                         clip0slope = -clipyslope/clipxslope;
5132                                         clip0dir = clipxslope > 0 ? 1 : -1;
5133                                 }
5134                                 else if(clipyslope > 0)
5135                                 {
5136                                         clip0origin = dpsoftrast.fb_width*floor(cliporigin/clipyslope);
5137                                         clip0slope = dpsoftrast.fb_width;
5138                                         clip0dir = -1;
5139                                 }
5140                                 else if(clipyslope < 0)
5141                                 {
5142                                         clip0origin = dpsoftrast.fb_width*ceil(cliporigin/clipyslope);
5143                                         clip0slope = -dpsoftrast.fb_width;
5144                                         clip0dir = -1;
5145                                 }
5146                                 else if(clip0origin < 0) continue;
5147                         }
5148
5149                         mipedgescale = _mm_setzero_ps();
5150                         for (j = 0;j < DPSOFTRAST_ARRAY_TOTAL; j++)
5151                         {
5152                                 __m128 attrib0, attrib1, attrib2;
5153                                 k = DPSOFTRAST_ShaderModeTable[thread->shader_mode].arrays[j];
5154                                 if (k >= DPSOFTRAST_ARRAY_TOTAL)
5155                                         break;
5156                                 arrays += numvertices*4;
5157                                 GENATTRIBS(attrib0, attrib1, attrib2);
5158                                 attriborigin = _mm_mul_ps(attrib1, w1);
5159                                 attribedge1 = _mm_sub_ps(_mm_mul_ps(attrib0, w0), attriborigin);
5160                                 attribedge2 = _mm_sub_ps(_mm_mul_ps(attrib2, w2), attriborigin);
5161                                 attribxslope = _mm_sub_ps(_mm_mul_ps(attribuxslope, attribedge1), _mm_mul_ps(attribvxslope, attribedge2));
5162                                 attribyslope = _mm_sub_ps(_mm_mul_ps(attribvyslope, attribedge2), _mm_mul_ps(attribuyslope, attribedge1));
5163                                 attriborigin = _mm_sub_ps(attriborigin, _mm_add_ps(_mm_mul_ps(attribxslope, x1), _mm_mul_ps(attribyslope, y1)));
5164                                 _mm_storeu_ps(triangle->attribs[k][0], attribxslope);
5165                                 _mm_storeu_ps(triangle->attribs[k][1], attribyslope);
5166                                 _mm_storeu_ps(triangle->attribs[k][2], attriborigin);
5167                                 if (k == DPSOFTRAST_ShaderModeTable[thread->shader_mode].lodarrayindex)
5168                                 {
5169                                         mipedgescale = _mm_movelh_ps(triangleedge1, triangleedge2);
5170                                         mipedgescale = _mm_mul_ps(mipedgescale, mipedgescale);
5171                                         mipedgescale = _mm_rsqrt_ps(_mm_add_ps(mipedgescale, _mm_shuffle_ps(mipedgescale, mipedgescale, _MM_SHUFFLE(2, 3, 0, 1))));
5172                                         mipedgescale = _mm_mul_ps(_mm_sub_ps(_mm_movelh_ps(attrib0, attrib2), _mm_movelh_ps(attrib1, attrib1)), mipedgescale);
5173                                 }
5174                         }
5175
5176                         memset(triangle->mip, 0, sizeof(triangle->mip));
5177                         for (j = 0;j < DPSOFTRAST_MAXTEXTUREUNITS;j++)
5178                         {
5179                                 int texunit = DPSOFTRAST_ShaderModeTable[thread->shader_mode].texunits[j];
5180                                 if (texunit >= DPSOFTRAST_MAXTEXTUREUNITS)
5181                                         break;
5182                                 texture = thread->texbound[texunit];
5183                                 if (texture && texture->filter > DPSOFTRAST_TEXTURE_FILTER_LINEAR)
5184                                 {
5185                                         mipdensity = _mm_mul_ps(mipedgescale, _mm_cvtepi32_ps(_mm_shuffle_epi32(_mm_loadl_epi64((const __m128i *)&texture->mipmap[0][2]), _MM_SHUFFLE(1, 0, 1, 0))));
5186                                         mipdensity = _mm_mul_ps(mipdensity, mipdensity);
5187                                         mipdensity = _mm_add_ps(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 3, 0, 1)));
5188                                         mipdensity = _mm_min_ss(mipdensity, _mm_shuffle_ps(mipdensity, mipdensity, _MM_SHUFFLE(2, 2, 2, 2)));
5189                                         // this will be multiplied in the texturing routine by the texture resolution
5190                                         y = _mm_cvtss_si32(mipdensity);
5191                                         if (y > 0)
5192                                         {
5193                                                 y = (int)(log((float)y)*0.5f/M_LN2);
5194                                                 if (y > texture->mipmaps - 1)
5195                                                         y = texture->mipmaps - 1;
5196                                                 triangle->mip[texunit] = y;
5197                                         }
5198                                 }
5199                         }
5200                 }
5201         
5202                 for (y = starty, bandy = min(endy, maxy1); y < endy; bandy = min(endy, maxy2), y = max(y, miny2))
5203                 for (; y < bandy;)
5204                 {
5205                         __m128 xcoords, xslope;
5206                         __m128i ycc = _mm_cmpgt_epi32(_mm_set1_epi32(y), screeny);
5207                         int yccmask = _mm_movemask_epi8(ycc);
5208                         int edge0p, edge0n, edge1p, edge1n;
5209                         int nexty;
5210                         float w, wslope;
5211                         float clip0;
5212                         if (numpoints == 4)
5213                         {
5214                                 switch(yccmask)
5215                                 {
5216                                 default:
5217                                 case 0xFFFF: /*0000*/ y = endy; continue;
5218                                 case 0xFFF0: /*1000*/ edge0p = 3;edge0n = 0;edge1p = 1;edge1n = 0;break;
5219                                 case 0xFF0F: /*0100*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5220                                 case 0xFF00: /*1100*/ edge0p = 3;edge0n = 0;edge1p = 2;edge1n = 1;break;
5221                                 case 0xF0FF: /*0010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break;
5222                                 case 0xF0F0: /*1010*/ edge0p = 1;edge0n = 2;edge1p = 3;edge1n = 2;break; // concave - nonsense
5223                                 case 0xF00F: /*0110*/ edge0p = 0;edge0n = 1;edge1p = 3;edge1n = 2;break;
5224                                 case 0xF000: /*1110*/ edge0p = 3;edge0n = 0;edge1p = 3;edge1n = 2;break;
5225                                 case 0x0FFF: /*0001*/ edge0p = 2;edge0n = 3;edge1p = 0;edge1n = 3;break;
5226                                 case 0x0FF0: /*1001*/ edge0p = 2;edge0n = 3;edge1p = 1;edge1n = 0;break;
5227                                 case 0x0F0F: /*0101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break; // concave - nonsense
5228                                 case 0x0F00: /*1101*/ edge0p = 2;edge0n = 3;edge1p = 2;edge1n = 1;break;
5229                                 case 0x00FF: /*0011*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 3;break;
5230                                 case 0x00F0: /*1011*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5231                                 case 0x000F: /*0111*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 3;break;
5232                                 case 0x0000: /*1111*/ y++; continue;
5233                                 }
5234                         }
5235                         else
5236                         {
5237                                 switch(yccmask)
5238                                 {
5239                                 default:
5240                                 case 0xFFFF: /*000*/ y = endy; continue;
5241                                 case 0xFFF0: /*100*/ edge0p = 2;edge0n = 0;edge1p = 1;edge1n = 0;break;
5242                                 case 0xFF0F: /*010*/ edge0p = 0;edge0n = 1;edge1p = 2;edge1n = 1;break;
5243                                 case 0xFF00: /*110*/ edge0p = 2;edge0n = 0;edge1p = 2;edge1n = 1;break;
5244                                 case 0x00FF: /*001*/ edge0p = 1;edge0n = 2;edge1p = 0;edge1n = 2;break;
5245                                 case 0x00F0: /*101*/ edge0p = 1;edge0n = 2;edge1p = 1;edge1n = 0;break;
5246                                 case 0x000F: /*011*/ edge0p = 0;edge0n = 1;edge1p = 0;edge1n = 2;break;
5247                                 case 0x0000: /*111*/ y++; continue;
5248                                 }
5249                         }
5250                         ycc = _mm_max_epi16(_mm_srli_epi16(ycc, 1), screeny);
5251                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(1, 0, 3, 2)));
5252                         ycc = _mm_min_epi16(ycc, _mm_shuffle_epi32(ycc, _MM_SHUFFLE(2, 3, 0, 1)));
5253                         nexty = _mm_extract_epi16(ycc, 0);
5254                         if (nexty >= bandy) nexty = bandy-1;
5255                         xslope = _mm_sub_ps(_mm_movelh_ps(screen[edge0n], screen[edge1n]), _mm_movelh_ps(screen[edge0p], screen[edge1p]));
5256                         xslope = _mm_div_ps(xslope, _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(3, 3, 1, 1)));
5257                         xcoords = _mm_add_ps(_mm_movelh_ps(screen[edge0p], screen[edge1p]),
5258                                                                 _mm_mul_ps(xslope, _mm_sub_ps(_mm_set1_ps(y), _mm_shuffle_ps(screen[edge0p], screen[edge1p], _MM_SHUFFLE(1, 1, 1, 1)))));
5259                         xcoords = _mm_add_ps(xcoords, _mm_set1_ps(0.5f));
5260                         if (_mm_ucomigt_ss(xcoords, _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2))))
5261                         {
5262                                 xcoords = _mm_shuffle_ps(xcoords, xcoords, _MM_SHUFFLE(1, 0, 3, 2));
5263                                 xslope = _mm_shuffle_ps(xslope, xslope, _MM_SHUFFLE(1, 0, 3, 2));
5264                         }
5265                         clip0 = clip0origin + (y+0.5f)*clip0slope + 0.5f;
5266                         for(; y <= nexty; y++, xcoords = _mm_add_ps(xcoords, xslope), clip0 += clip0slope)
5267                         {
5268                                 int startx, endx, offset;
5269                                 startx = _mm_cvtss_si32(xcoords);
5270                                 endx = _mm_cvtss_si32(_mm_movehl_ps(xcoords, xcoords));
5271                                 if (startx < minx) startx = minx;
5272                                 if (endx > maxx) endx = maxx;
5273                                 if (startx >= endx) continue;
5274
5275                                 if (clip0dir)
5276                                 {
5277                                         if (clip0dir > 0)
5278                                         {
5279                                                 if (startx < clip0) 
5280                                                 {
5281                                                         if(endx <= clip0) continue;
5282                                                         startx = (int)clip0;
5283                                                 }
5284                                         }
5285                                         else if (endx > clip0) 
5286                                         {
5287                                                 if(startx >= clip0) continue;
5288                                                 endx = (int)clip0;
5289                                         }
5290                                 }
5291                                                 
5292                                 for (offset = startx; offset < endx;offset += DPSOFTRAST_DRAW_MAXSPANLENGTH)
5293                                 {
5294                                         DPSOFTRAST_State_Span *span = &thread->spans[thread->numspans];
5295                                         span->triangle = thread->numtriangles;
5296                                         span->x = offset;
5297                                         span->y = y;
5298                                         span->startx = 0;
5299                                         span->endx = min(endx - offset, DPSOFTRAST_DRAW_MAXSPANLENGTH);
5300                                         if (span->startx >= span->endx)
5301                                                 continue;
5302                                         wslope = triangle->w[0];
5303                                         w = triangle->w[2] + span->x*wslope + span->y*triangle->w[1];
5304                                         span->depthslope = (int)(wslope*DPSOFTRAST_DEPTHSCALE);
5305                                         span->depthbase = (int)(w*DPSOFTRAST_DEPTHSCALE - DPSOFTRAST_DEPTHOFFSET*(thread->polygonoffset[1] + fabs(wslope)*thread->polygonoffset[0]));
5306                                         if (++thread->numspans >= DPSOFTRAST_DRAW_MAXSPANS)
5307                                                 DPSOFTRAST_Draw_ProcessSpans(thread);
5308                                 }
5309                         }
5310                 }
5311
5312                 if (++thread->numtriangles >= DPSOFTRAST_DRAW_MAXTRIANGLES)
5313                 {
5314                         DPSOFTRAST_Draw_ProcessSpans(thread);
5315                         thread->numtriangles = 0;
5316                 }
5317         }
5318
5319         if (!ATOMIC_DECREMENT(command->refcount))
5320         {
5321                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5322                         MM_FREE(command->arrays);
5323         }
5324
5325         if (thread->numspans > 0 || thread->numtriangles > 0)
5326         {
5327                 DPSOFTRAST_Draw_ProcessSpans(thread);
5328                 thread->numtriangles = 0;
5329         }
5330 #endif
5331 }
5332
5333 static DPSOFTRAST_Command_Draw *DPSOFTRAST_Draw_AllocateDrawCommand(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5334 {
5335         int i;
5336         int j;
5337         int commandsize = DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw));
5338         int datasize = 2*numvertices*sizeof(float[4]);
5339         DPSOFTRAST_Command_Draw *command;
5340         unsigned char *data;
5341         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5342         {
5343                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5344                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5345                         break;
5346                 datasize += numvertices*sizeof(float[4]);
5347         }
5348         if (element3s)
5349                 datasize += numtriangles*sizeof(unsigned short[3]);
5350         else if (element3i)
5351                 datasize += numtriangles*sizeof(int[3]);
5352         datasize = DPSOFTRAST_ALIGNCOMMAND(datasize);
5353         if (commandsize + datasize > DPSOFTRAST_DRAW_MAXCOMMANDSIZE)
5354         {
5355                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize);
5356                 data = (unsigned char *)MM_CALLOC(datasize, 1);
5357         }
5358         else
5359         {
5360                 command = (DPSOFTRAST_Command_Draw *) DPSOFTRAST_AllocateCommand(DPSOFTRAST_OPCODE_Draw, commandsize + datasize);
5361                 data = (unsigned char *)command + commandsize;
5362         }
5363         command->firstvertex = firstvertex;
5364         command->numvertices = numvertices;
5365         command->numtriangles = numtriangles;
5366         command->arrays = (float *)data;
5367         memset(dpsoftrast.post_array4f, 0, sizeof(dpsoftrast.post_array4f));
5368         dpsoftrast.firstvertex = firstvertex;
5369         dpsoftrast.numvertices = numvertices;
5370         dpsoftrast.screencoord4f = (float *)data;
5371         data += numvertices*sizeof(float[4]);
5372         dpsoftrast.post_array4f[DPSOFTRAST_ARRAY_POSITION] = (float *)data;
5373         data += numvertices*sizeof(float[4]);
5374         for (i = 0; i < DPSOFTRAST_ARRAY_TOTAL; i++)
5375         {
5376                 j = DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].arrays[i];
5377                 if (j >= DPSOFTRAST_ARRAY_TOTAL)
5378                         break;
5379                 dpsoftrast.post_array4f[j] = (float *)data;
5380                 data += numvertices*sizeof(float[4]);
5381         }
5382         command->element3i = NULL;
5383         command->element3s = NULL;
5384         if (element3s)
5385         {
5386                 command->element3s = (unsigned short *)data;
5387                 memcpy(command->element3s, element3s, numtriangles*sizeof(unsigned short[3]));
5388         }
5389         else if (element3i)
5390         {
5391                 command->element3i = (int *)data;
5392                 memcpy(command->element3i, element3i, numtriangles*sizeof(int[3]));
5393         }
5394         return command;
5395 }
5396
5397 void DPSOFTRAST_DrawTriangles(int firstvertex, int numvertices, int numtriangles, const int *element3i, const unsigned short *element3s)
5398 {
5399         DPSOFTRAST_Command_Draw *command = DPSOFTRAST_Draw_AllocateDrawCommand(firstvertex, numvertices, numtriangles, element3i, element3s);
5400         DPSOFTRAST_ShaderModeTable[dpsoftrast.shader_mode].Vertex();
5401         command->starty = bound(0, dpsoftrast.drawstarty, dpsoftrast.fb_height);
5402         command->endy = bound(0, dpsoftrast.drawendy, dpsoftrast.fb_height);
5403         if (command->starty >= command->endy)
5404         {
5405                 if (command->commandsize <= DPSOFTRAST_ALIGNCOMMAND(sizeof(DPSOFTRAST_Command_Draw)))
5406                         MM_FREE(command->arrays);
5407                 DPSOFTRAST_UndoCommand(command->commandsize);
5408                 return;
5409         }
5410         command->clipped = dpsoftrast.drawclipped;
5411         command->refcount = dpsoftrast.numthreads;
5412
5413         if (dpsoftrast.usethreads)
5414         {
5415                 int i;
5416                 DPSOFTRAST_Draw_SyncCommands();
5417                 for (i = 0; i < dpsoftrast.numthreads; i++)
5418                 {
5419                         DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5420                         if (((command->starty < thread->maxy1 && command->endy > thread->miny1) || (command->starty < thread->maxy2 && command->endy > thread->miny2)) && thread->starving)
5421                                 Thread_CondSignal(thread->drawcond);
5422                 }
5423         }
5424         else
5425         {
5426                 DPSOFTRAST_Draw_FlushThreads();
5427         }
5428 }
5429
5430 DEFCOMMAND(23, SetRenderTargets, int width; int height;)
5431 static void DPSOFTRAST_Interpret_SetRenderTargets(DPSOFTRAST_State_Thread *thread, const DPSOFTRAST_Command_SetRenderTargets *command)
5432 {
5433         thread->validate |= DPSOFTRAST_VALIDATE_FB;
5434 }
5435 void DPSOFTRAST_SetRenderTargets(int width, int height, unsigned int *depthpixels, unsigned int *colorpixels0, unsigned int *colorpixels1, unsigned int *colorpixels2, unsigned int *colorpixels3)
5436 {
5437         DPSOFTRAST_Command_SetRenderTargets *command;
5438         if (width != dpsoftrast.fb_width || height != dpsoftrast.fb_height || depthpixels != dpsoftrast.fb_depthpixels ||
5439                 colorpixels0 != dpsoftrast.fb_colorpixels[0] || colorpixels1 != dpsoftrast.fb_colorpixels[1] ||
5440                 colorpixels2 != dpsoftrast.fb_colorpixels[2] || colorpixels3 != dpsoftrast.fb_colorpixels[3])
5441                 DPSOFTRAST_Flush();
5442         dpsoftrast.fb_width = width;
5443         dpsoftrast.fb_height = height;
5444         dpsoftrast.fb_depthpixels = depthpixels;
5445         dpsoftrast.fb_colorpixels[0] = colorpixels0;
5446         dpsoftrast.fb_colorpixels[1] = colorpixels1;
5447         dpsoftrast.fb_colorpixels[2] = colorpixels2;
5448         dpsoftrast.fb_colorpixels[3] = colorpixels3;
5449         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5450         command = DPSOFTRAST_ALLOCATECOMMAND(SetRenderTargets);
5451         command->width = width;
5452         command->height = height;
5453 }
5454  
5455 static void DPSOFTRAST_Draw_InterpretCommands(DPSOFTRAST_State_Thread *thread, int endoffset)
5456 {
5457         int commandoffset = thread->commandoffset;
5458         while (commandoffset != endoffset)
5459         {
5460                 DPSOFTRAST_Command *command = (DPSOFTRAST_Command *)&dpsoftrast.commandpool.commands[commandoffset];
5461                 switch (command->opcode)
5462                 {
5463 #define INTERPCOMMAND(name) \
5464                 case DPSOFTRAST_OPCODE_##name : \
5465                         DPSOFTRAST_Interpret_##name (thread, (DPSOFTRAST_Command_##name *)command); \
5466                         commandoffset += DPSOFTRAST_ALIGNCOMMAND(sizeof( DPSOFTRAST_Command_##name )); \
5467                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL) \
5468                                 commandoffset = 0; \
5469                         break;
5470                 INTERPCOMMAND(Viewport)
5471                 INTERPCOMMAND(ClearColor)
5472                 INTERPCOMMAND(ClearDepth)
5473                 INTERPCOMMAND(ColorMask)
5474                 INTERPCOMMAND(DepthTest)
5475                 INTERPCOMMAND(ScissorTest)
5476                 INTERPCOMMAND(Scissor)
5477                 INTERPCOMMAND(BlendFunc)
5478                 INTERPCOMMAND(BlendSubtract)
5479                 INTERPCOMMAND(DepthMask)
5480                 INTERPCOMMAND(DepthFunc)
5481                 INTERPCOMMAND(DepthRange)
5482                 INTERPCOMMAND(PolygonOffset)
5483                 INTERPCOMMAND(CullFace)
5484                 INTERPCOMMAND(SetTexture)
5485                 INTERPCOMMAND(SetShader)
5486                 INTERPCOMMAND(Uniform4f)
5487                 INTERPCOMMAND(UniformMatrix4f)
5488                 INTERPCOMMAND(Uniform1i)
5489                 INTERPCOMMAND(SetRenderTargets)
5490                 INTERPCOMMAND(ClipPlane)
5491
5492                 case DPSOFTRAST_OPCODE_Draw:
5493                         DPSOFTRAST_Interpret_Draw(thread, (DPSOFTRAST_Command_Draw *)command);
5494                         commandoffset += command->commandsize;
5495                         if (commandoffset >= DPSOFTRAST_DRAW_MAXCOMMANDPOOL)
5496                                 commandoffset = 0;
5497                         thread->commandoffset = commandoffset;
5498                         break;
5499
5500                 case DPSOFTRAST_OPCODE_Reset:
5501                         commandoffset = 0;
5502                         break;
5503                 }
5504         }
5505         thread->commandoffset = commandoffset;
5506 }
5507
5508 static int DPSOFTRAST_Draw_Thread(void *data)
5509 {
5510         DPSOFTRAST_State_Thread *thread = (DPSOFTRAST_State_Thread *)data;
5511         while(thread->index >= 0)
5512         {
5513                 if (thread->commandoffset != dpsoftrast.drawcommand)
5514                 {
5515                         DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);      
5516                 }
5517                 else 
5518                 {
5519                         Thread_LockMutex(thread->drawmutex);
5520                         if (thread->commandoffset == dpsoftrast.drawcommand && thread->index >= 0)
5521                         {
5522                                 if (thread->waiting) Thread_CondSignal(thread->waitcond);
5523                                 thread->starving = true;
5524                                 Thread_CondWait(thread->drawcond, thread->drawmutex);
5525                                 thread->starving = false;
5526                         }
5527                         Thread_UnlockMutex(thread->drawmutex);
5528                 }
5529         }   
5530         return 0;
5531 }
5532
5533 static void DPSOFTRAST_Draw_FlushThreads(void)
5534 {
5535         DPSOFTRAST_State_Thread *thread;
5536         int i;
5537         DPSOFTRAST_Draw_SyncCommands();
5538         if (dpsoftrast.usethreads) 
5539         {
5540                 for (i = 0; i < dpsoftrast.numthreads; i++)
5541                 {
5542                         thread = &dpsoftrast.threads[i];
5543                         if (thread->commandoffset != dpsoftrast.drawcommand)
5544                         {
5545                                 Thread_LockMutex(thread->drawmutex);
5546                                 if (thread->commandoffset != dpsoftrast.drawcommand && thread->starving)
5547                                         Thread_CondSignal(thread->drawcond);
5548                                 Thread_UnlockMutex(thread->drawmutex);
5549                         }
5550                 }
5551                 for (i = 0; i < dpsoftrast.numthreads; i++)
5552                 {
5553                         thread = &dpsoftrast.threads[i];
5554                         if (thread->commandoffset != dpsoftrast.drawcommand)
5555                         {
5556                                 Thread_LockMutex(thread->drawmutex);
5557                                 if (thread->commandoffset != dpsoftrast.drawcommand)
5558                                 {
5559                                         thread->waiting = true;
5560                                         Thread_CondWait(thread->waitcond, thread->drawmutex);
5561                                         thread->waiting = false;
5562                                 }
5563                                 Thread_UnlockMutex(thread->drawmutex);
5564                         }
5565                 }
5566         }
5567         else
5568         {
5569                 for (i = 0; i < dpsoftrast.numthreads; i++)
5570                 {
5571                         thread = &dpsoftrast.threads[i];
5572                         if (thread->commandoffset != dpsoftrast.drawcommand)
5573                                 DPSOFTRAST_Draw_InterpretCommands(thread, dpsoftrast.drawcommand);
5574                 }
5575         }
5576         dpsoftrast.commandpool.usedcommands = 0;
5577 }
5578
// Public flush entry point; forwards to DPSOFTRAST_Draw_FlushThreads.
void DPSOFTRAST_Flush(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5583
// Does the same work as DPSOFTRAST_Flush: runs the thread flush routine.
void DPSOFTRAST_Finish(void)
{
	DPSOFTRAST_Draw_FlushThreads();
}
5588
5589 int DPSOFTRAST_Init(int width, int height, int numthreads, int interlace, unsigned int *colorpixels, unsigned int *depthpixels)
5590 {
5591         int i;
5592         union
5593         {
5594                 int i;
5595                 unsigned char b[4];
5596         }
5597         u;
5598         u.i = 1;
5599         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5600         dpsoftrast.bigendian = u.b[3];
5601         dpsoftrast.fb_width = width;
5602         dpsoftrast.fb_height = height;
5603         dpsoftrast.fb_depthpixels = depthpixels;
5604         dpsoftrast.fb_colorpixels[0] = colorpixels;
5605         dpsoftrast.fb_colorpixels[1] = NULL;
5606         dpsoftrast.fb_colorpixels[1] = NULL;
5607         dpsoftrast.fb_colorpixels[1] = NULL;
5608         dpsoftrast.viewport[0] = 0;
5609         dpsoftrast.viewport[1] = 0;
5610         dpsoftrast.viewport[2] = dpsoftrast.fb_width;
5611         dpsoftrast.viewport[3] = dpsoftrast.fb_height;
5612         DPSOFTRAST_RecalcViewport(dpsoftrast.viewport, dpsoftrast.fb_viewportcenter, dpsoftrast.fb_viewportscale);
5613         dpsoftrast.texture_firstfree = 1;
5614         dpsoftrast.texture_end = 1;
5615         dpsoftrast.texture_max = 0;
5616         dpsoftrast.color[0] = 1;
5617         dpsoftrast.color[1] = 1;
5618         dpsoftrast.color[2] = 1;
5619         dpsoftrast.color[3] = 1;
5620         dpsoftrast.usethreads = numthreads > 0 && Thread_HasThreads();
5621         dpsoftrast.interlace = dpsoftrast.usethreads ? bound(0, interlace, 1) : 0;
5622         dpsoftrast.numthreads = dpsoftrast.usethreads ? bound(1, numthreads, 64) : 1;
5623         dpsoftrast.threads = (DPSOFTRAST_State_Thread *)MM_CALLOC(dpsoftrast.numthreads, sizeof(DPSOFTRAST_State_Thread));
5624         for (i = 0; i < dpsoftrast.numthreads; i++)
5625         {
5626                 DPSOFTRAST_State_Thread *thread = &dpsoftrast.threads[i];
5627                 thread->index = i;
5628                 thread->cullface = GL_BACK;
5629         thread->colormask[0] = 1; 
5630                 thread->colormask[1] = 1;
5631                 thread->colormask[2] = 1;
5632                 thread->colormask[3] = 1;
5633                 thread->blendfunc[0] = GL_ONE;
5634                 thread->blendfunc[1] = GL_ZERO;
5635                 thread->depthmask = true;
5636                 thread->depthtest = true;
5637                 thread->depthfunc = GL_LEQUAL;
5638                 thread->scissortest = false;
5639                 thread->viewport[0] = 0;
5640                 thread->viewport[1] = 0;
5641                 thread->viewport[2] = dpsoftrast.fb_width;
5642                 thread->viewport[3] = dpsoftrast.fb_height;
5643                 thread->scissor[0] = 0;
5644                 thread->scissor[1] = 0;
5645                 thread->scissor[2] = dpsoftrast.fb_width;
5646                 thread->scissor[3] = dpsoftrast.fb_height;
5647                 thread->depthrange[0] = 0;
5648                 thread->depthrange[1] = 1;
5649                 thread->polygonoffset[0] = 0;
5650                 thread->polygonoffset[1] = 0;
5651                 thread->clipplane[0] = 0;
5652                 thread->clipplane[1] = 0;
5653                 thread->clipplane[2] = 0;
5654                 thread->clipplane[3] = 1;
5655         
5656                 thread->numspans = 0;
5657                 thread->numtriangles = 0;
5658                 thread->commandoffset = 0;
5659                 thread->waiting = false;
5660                 thread->starving = false;
5661            
5662                 thread->validate = -1;
5663                 DPSOFTRAST_Validate(thread, -1);
5664  
5665                 if (dpsoftrast.usethreads)
5666                 {
5667                         thread->waitcond = Thread_CreateCond();
5668                         thread->drawcond = Thread_CreateCond();
5669                         thread->drawmutex = Thread_CreateMutex();
5670                         thread->thread = Thread_CreateThread(DPSOFTRAST_Draw_Thread, thread);
5671                 }
5672         }
5673         return 0;
5674 }
5675
5676 void DPSOFTRAST_Shutdown(void)
5677 {
5678         int i;
5679         if (dpsoftrast.usethreads && dpsoftrast.numthreads > 0)
5680         {
5681                 DPSOFTRAST_State_Thread *thread;
5682                 for (i = 0; i < dpsoftrast.numthreads; i++)
5683                 {
5684                         thread = &dpsoftrast.threads[i];
5685                         Thread_LockMutex(thread->drawmutex);
5686                         thread->index = -1;
5687                         Thread_CondSignal(thread->drawcond);
5688                         Thread_UnlockMutex(thread->drawmutex);
5689                         Thread_WaitThread(thread->thread, 0);
5690                         Thread_DestroyCond(thread->waitcond);
5691                         Thread_DestroyCond(thread->drawcond);
5692                         Thread_DestroyMutex(thread->drawmutex);
5693                 }
5694         }
5695         for (i = 0;i < dpsoftrast.texture_end;i++)
5696                 if (dpsoftrast.texture[i].bytes)
5697                         MM_FREE(dpsoftrast.texture[i].bytes);
5698         if (dpsoftrast.texture)
5699                 free(dpsoftrast.texture);
5700         if (dpsoftrast.threads)
5701                 MM_FREE(dpsoftrast.threads);
5702         memset(&dpsoftrast, 0, sizeof(dpsoftrast));
5703 }
5704